from sklearn.exceptions import ConvergenceWarning
from warnings import simplefilter
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import itertools
from collections import Counter, OrderedDict
# NLP
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import RegexpTokenizer
from PIL import Image
from wordcloud import WordCloud
# Sklearn
from sklearn.decomposition import PCA, TruncatedSVD, NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import scale, StandardScaler
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth
from sklearn.datasets import make_blobs
from sklearn.utils import shuffle
from sklearn.metrics import davies_bouldin_score
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
# NOTE(review): hard-coded HTTP proxy used for the Billboard scraping cells
# below — verify it is still live before re-running the scrape.
proxies = {'http': 'http://206.189.157.23'}
# Path to the pickled DataFrame of scraped songs (rank, song, artist, year,
# lyrics) produced by the scraping cells near the end of this notebook.
abspath_song_info = ("/mnt/processed/private/msds2023/lt2/DMW1_Project/"
                     "all_songs.pkl")
# IPython magic: this file is a notebook export and is not importable as a
# plain .py module while this line is present.
%matplotlib inline
# Pandas settings
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('mode.chained_assignment', None)
# Error Filters
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
# Colors used consistently across the plots below
c_black = '#000000'
c_gray = '#657786'
c_blue = '#1a48c4'
c_white = '#ffffff'
c_red = '#b91c20'
c_green = '#00e091'
# Manually curated stop-word list: English function words (NLTK-style),
# lyric fillers ('ooh', 'yeah', 'woah', ...), contractions with apostrophes
# stripped ('dont', 'aint', ...), and common Spanish function words
# ('que', 'la', 'el', ...).  Passed as stop_words= to the vectorizers below.
# NOTE(review): this assignment shadows the `stopwords` corpus imported from
# nltk.corpus at the top of the file — presumably intentional, but confirm.
stopwords = ['out', 'wont', 'for', 'shan', 'mightn', 'mightnt', 'ya', 're',
             'itself', 'you', 'aren', 'off', 'id', 'que', 'doing', 'nor',
             'ooh', 'un', 'weren', 'are', 'uh', 's', 'between', 'your',
             'that', 'they', 'as', 'ill', 'all', 'ahh', 'does', 'she',
             'thatll', 'can', 'isn', 'my', 'ourselves', 'ain', 'hadn',
             'being', 'to', 'a', 'if', 'any', 'arent', 'down', 'have', 'has',
             'having', 've', 'because', 'y', 'the', 'll', 'needn', 'mi',
             'both', 'no', 'what', 'other', 'those', 'these', 'herself', 'do',
             'then', 'of', 'been', 'didn', 'la', 'again', 'yourself', 'under',
             'mustn', 'ma', 'couldnt', 'woah', 'hee', 'le', 'o', 'had', 'am',
             'neednt', 'd', 'oh', 'which', 'myself', 'by', 'each', 'with',
             'wouldnt', 'da', 'and', 'but', 'how', 'los', 'ours', 'hers',
             'against', 'only', 'ayy', 'hes', 'ah', 'lo', 'until', 'hasnt',
             'hey', 'before', 'an', 'now', 'than', 'up', 'when', 'himself',
             'yeah', 'yours', 'more', 'werent', 'i', 'dont', 'wasn', 'there',
             'further', 'whoa', 'wasnt', 'at', 'haven', 'in', 'while', 'is',
             'through', 'above', 'from', 'hu', 'their', 'youve', 'na', 'most',
             'them', 'wouldn', 'or', 'why', 'isnt', 'youre', 'me', 'same',
             'where', 'shes', 'we', "i'll", 'tu', 'don', 'im', 'de', 'ese',
             'just', 'very', 'during', 'shouldve', 'm', 'havent', 'be',
             'ho', 'yo', 'it', "i'm", 'ive', 'themselves', 'couldn', 'her',
             'youll', 'this', 'shouldnt', 'after', 'ay', 'haa', 'en', 'once',
             'hadnt', 'own', 'theirs', 'its', 'will', 'about', 'some', 'es',
             'youd', 'el', 'our', 'him', 'mustnt', 'not', 'did', 'were',
             'mmm', 'em', 'on', 'didnt', 'whom', 'below', 'won', 'his', 'he',
             'over', 'te', 'here', 'such', 'too', 'should', 'few', 'shant',
             'was', 'into', 'hasn', 'yourselves', 'who', 't', 'so', 'doesn',
             'ha', 'doesnt', 'eh', 'shouldn', 'ye', 'pa', 'si', 'se', 'nah',
             'aint', 'huh', 'cause', 'one', 'well', 'let', 'thats', 'could',
             'every', 'even', 'gotta', 'around', 'gonna', 'really', 'gon',
             'ever', 'bout', 'us', 'lets', 'theres', 'two', 'much',
             'whats', 'might', 'til', 'ima', 'another', 'something'
             ]
def preprocess_lyrics(lyrics):
    """Return lyrics lower-cased, stripped of Genius boilerplate, and
    lemmatized.

    Cleaning steps: drop the "<Song> Lyrics" header, the trailing
    'Embed' marker and the digits glued before it, and the
    'you might also like' footer; replace dashes with spaces, remove
    non-letter characters, and standardize whitespace.  Each word is
    then lemmatized successively as noun, verb, adjective, and adverb.

    Parameters
    ----------
    lyrics : str
        Raw lyrics text as scraped from Genius.

    Returns
    -------
    str
        Cleaned, lemmatized lyrics.
    """
    # Convert to lowercase
    lyrics = lyrics.lower()
    # Removes titles ('yrics\n' matches 'Lyrics'/'lyrics' after lower-casing)
    lyrics = lyrics.split('yrics\n', 1)[-1]
    # Removes 'Embed' from the end
    lyrics = lyrics.rsplit('embed', 1)[0]
    # Removes digits before embed
    lyrics = lyrics.rstrip('1234567890')
    # Removes 'you might also like'
    lyrics = lyrics.rsplit('you might also like', 1)[0]
    # Replace dash with space so hyphenated words split into tokens.
    # Raw strings avoid invalid-escape warnings on modern Python.
    lyrics = re.sub(r"[-]", " ", lyrics)
    # Remove non-letters
    lyrics = re.sub(r"[^\sA-Za-z]", "", lyrics)
    # Standardize whitespace
    lyrics = re.sub(r"[\s]", " ", lyrics)
    # Lemmatize cumulatively: feed each POS pass's output into the next.
    # (Bug fix: the original re-lemmatized the raw text on every pass,
    # so only the final ADV pass actually took effect.)
    lemma = WordNetLemmatizer()
    for pos in [wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV]:
        lyrics = " ".join(lemma.lemmatize(word, pos)
                          for word in lyrics.split())
    return lyrics
def df_info(df):
    """Print the dimensions, dtypes, summary statistics, and null-value
    counts of a given data frame."""
    n_rows, n_cols = df.shape
    print(f'Number of rows: {n_rows}\n'
          f'Number of columns: {n_cols}\n')
    # Column list, non-null counts, and dtypes.
    print('List of all columns, count of non-null values, and datatypes of '
          'the df:\n')
    display(df.info())
    # Numerical summary statistics.
    print('\nSummary statistics for numerical columns:\n')
    display(df.describe())
    # Object (string) summary statistics, transposed for readability.
    print('Summary statistics for object(string) columns:\n')
    display(df.describe(include=[object]).T)
    # Null-value diagnostics: absolute counts then percentages.
    print('Check for null columns:\n')
    display(df.isna().sum())
    print('Display % of null columns:\n')
    display(df.isna().sum() / len(df) * 100)
def pairwise(iterable):
    """Yield successive overlapping pairs from *iterable*.

    pairwise('ABCDEFG') --> AB BC CD DE EF FG
    """
    first, second = itertools.tee(iterable)
    # Advance the second iterator one step so the two run offset by one.
    next(second, None)
    return zip(first, second)
def plot_word_cloud(nmf_model, feature_names, k=300, n_topics=4):
    """Return an n_topics-2 rows by 3 columns wordclouds of the words
    per topic.

    Parameters: nmf_model is a fitted NMF model (components_ is read),
    feature_names the vectorizer vocabulary, k the number of top-weighted
    terms per topic, n_topics how many topics to draw (grid is fixed 2x3).
    """
    # Build one space-joined string of the k strongest terms per topic.
    topic_word_list = []
    for index, topic in enumerate(nmf_model.components_):
        terms_comp = zip(feature_names, topic)
        # Sort terms by NMF weight, descending, keep the k strongest.
        sorted_terms = sorted(
            terms_comp, key=lambda x: x[1], reverse=True)[:k]
        topic = " "
        for t in sorted_terms:
            topic = topic + ' ' + t[0]
        topic_word_list.append(topic)
    # Fixed 2x3 grid; only the first n_topics cells are filled.
    ncols = 3
    nrows = 2
    fig = plt.figure(figsize=(15, 10))
    for i in range(n_topics):
        ax = fig.add_subplot(nrows, ncols, i+1)
        # Fixed random_state keeps the cloud layout reproducible.
        wc = WordCloud(background_color='white',
                       colormap='winter',
                       max_words=k,
                       contour_color=c_blue,
                       contour_width=2,
                       font_step=2,
                       relative_scaling=0,
                       random_state=42).generate(topic_word_list[i])
        plt.imshow(wc)
        plt.title(f"Topic {i+1}", color=c_blue, fontsize=24, weight='bold')
        plt.axis("off")
    plt.tight_layout()
def get_tfidf(df):
    """Vectorize the song lyrics into a TF-IDF feature matrix and return
    it together with the fitted vocabulary."""
    docs = list(df['lyrics'])
    # Uni- to tri-grams; drop terms in fewer than 3 songs or more than
    # half of them, plus the module-level custom stop-word list.
    tfidf = TfidfVectorizer(
        min_df=3, max_df=0.50, ngram_range=(1, 3), stop_words=stopwords)
    matrix = tfidf.fit_transform(docs)
    vocab = tfidf.get_feature_names_out()
    return matrix, vocab
def truncated_svd(X):
    """Return the Q, Sigma, and P SVD factors of the design matrix X as
    well as the normalized sum of squared distances from the origin
    (variance explained per singular vector)."""
    left, sv, right_t = np.linalg.svd(X)
    variance = np.square(sv)
    nssd = variance / variance.sum()
    # P is returned transposed so its columns are the right singular vectors.
    return left, np.diag(sv), right_t.T, nssd
def plot_sv(nssd, thresh=0.95, title=None):
    """Plot cumulative variance explained of a given nssd to have an idea
    on the value of k.

    Returns the number of singular vectors needed to reach *thresh*
    cumulative variance explained.
    """
    # Count of cumulative-sum entries still below the threshold, plus one
    # = first index where the threshold is reached (1-based).
    svs = (sum(nssd.cumsum() < thresh) + 1)
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(range(1, len(nssd)+1), nssd.cumsum(), c_green, label='cumulative',
            linewidth=5)
    # Mark the threshold (horizontal) and the retained-SV count (vertical).
    ax.axhline(thresh, ls='--', color=c_red)
    ax.axvline(svs, ls='--', color=c_red)
    ax.set_xlim(0, len(nssd)+1)
    ax.set_xlabel('SV', fontsize=16, c=c_black)
    ax.set_ylabel('cumulative variance explained', fontsize=16, c=c_black)
    ax.set_title(f'{title}: Retained {svs} SVs',
                 fontsize=24, color=c_black, weight='bold')
    plt.tight_layout()
    return svs
def project_svd(q, s, k):
    """Project the design matrix onto the first k singular vectors."""
    # Equivalent to q[:, :k] @ s[:k, :k].
    return np.matmul(q[:, :k], s[:k, :k])
def topics(rows, cols, features, p):
    """Plot top rxc topics based on the given matrix.

    For each of the first rows*cols singular vectors of *p* (features x
    components), draws a horizontal bar chart of the ten features with
    the largest absolute weight.
    """
    p = np.asarray(p)
    fig, ax = plt.subplots(nrows=rows, ncols=cols,
                           figsize=(7*cols, 7+rows*2),
                           sharex=True)
    for i, ax in enumerate(ax.flat):
        # Indices of the ten largest |weights| on singular vector i,
        # in ascending order (so the biggest bar is drawn on top).
        rank = np.argsort(np.abs(p[:, i]))[-10:]
        ax.barh([features[r] for r in rank], p[rank, i], color=c_green)
        ax.set_title(f'SV{i+1}', fontsize=14, c=c_black)
    plt.suptitle(f'Top {rows*cols} topics uncovered', fontsize=24,
                 color=c_black, weight='bold')
    plt.rc('legend', fontsize=20)
    plt.tight_layout()
def get_top_100(soup):
    """Returns the rank, song, and artist of the top 100 songs from
    Billboard.com"""
    # '.c-title' nodes hold the song titles; the sibling '.c-label'
    # nodes hold the artist names.
    titles = [node.text.strip() for node in soup.select('li .c-title')]
    performers = [node.text.strip()
                  for node in soup.select('li .c-title ~ .c-label')]
    # Map 1-based chart rank -> {'song': ..., 'artist': ...}.
    ranked_pairs = enumerate(zip(titles, performers), start=1)
    return {rank: {'song': title, 'artist': performer}
            for rank, (title, performer) in ranked_pairs}
def artist_songs_stats():
    """Print average number of songs per artist and the artist with the most
    number of songs in the dataset, then plot the histogram of the songs
    per artist.

    Reads the module-level `df_all` DataFrame (expects an 'artist' column).
    """
    # Song counts per artist, ordered most-common first.
    songs_per_artist = OrderedDict(
        Counter(df_all.artist).most_common()).values()
    avg_song_per_artist = np.mean(list(songs_per_artist))
    max_song_of_artist = np.max(list(songs_per_artist))
    # First key of the most_common() ordering = artist with the most songs.
    artist_most_n_songs = list(OrderedDict(
        Counter(df_all.artist).most_common()))[0]
    print(f'Average number of songs per artist: {avg_song_per_artist:.2f}')
    print(f'{artist_most_n_songs} had the most number of songs with '
          f'{max_song_of_artist:.0f} songs')
    fig = plt.figure(figsize=(10, 5))
    plt.hist(songs_per_artist, color=c_green, bins=24)
    plt.xlabel("Number of songs", fontsize=16)
    plt.ylabel("Number of artists", fontsize=16)
    plt.title("Distribution of songs per artist", fontsize=24, weight='bold')
    # Clip the x-axis at 25 songs to keep the bulk of the distribution visible.
    plt.xlim(0, 25)
    plt.plot()
def n_words_in_lyrics():
    """Print average number of words per song lyrics and plots its
    distribution.

    Reads the module-level `df_all` DataFrame (expects a 'lyrics' column).
    """
    # Words per song (split on single spaces).
    word_counts = [len(df_all.lyrics.str.split(' ')[i]) for i in df_all.index]
    # Songs above 900 words are treated as outliers and excluded from
    # both the average and the histogram.
    filtered = list(filter(lambda x: x < 900, word_counts))
    avg_words_lyrics = np.mean(filtered)
    print(f'Average words in song lyrics: {avg_words_lyrics:.0f} words')
    fig, axs = plt.subplots(figsize=(10, 5))
    axs.hist(filtered, bins=100, color=c_green)
    axs.set_xlabel("Words in song", fontsize=16)
    axs.set_ylabel("Number of songs", fontsize=16)
    axs.set_title("Distribution of words per song lyrics",
                  fontsize=24, weight='bold')
    # Hide the right and top spines
    axs.spines['right'].set_visible(False)
    axs.spines['top'].set_visible(False)
    plt.show()
def get_countvec(df, n=10):
    """Returns the top words and how much it is used in the era."""
    docs = list(df['lyrics'])
    cv = CountVectorizer(
        min_df=3, max_df=0.50, ngram_range=(1, 3), stop_words=stopwords)
    counts = cv.fit_transform(docs)
    vocab = cv.get_feature_names_out()
    # Document frequency: in how many songs each term appears at least once.
    doc_freq = counts.todense().astype(bool).sum(axis=0)
    df_top_words = pd.DataFrame()
    df_top_words['count'] = (
        pd.DataFrame(doc_freq, columns=vocab).T.nlargest(n, 0))
    df_top_words['% of songs'] = df_top_words['count'] / df.shape[0] * 100
    return df_top_words
def top_words_per_era(n=5):
    """Prints the top words from all the three eras."""
    # Module-level era frames, in chronological order.
    era_frames = {
        "2006-2010": df_2006_2010,
        "2011-2015": df_2011_2015,
        "2016-2021": df_2016_2021,
    }
    for era_label, era_frame in era_frames.items():
        print(f'{era_label} era')
        display(get_countvec(era_frame, n))
def n_dimensions():
    """Print the number of dimensions of the lyrics data after performing
    TF-IDF."""
    # Module-level TF-IDF matrices, overall first then per era.
    matrices = [('All years', X_all), ('2006-2010', X_0610),
                ('2011-2015', X_1115), ('2016-2021', X_1621)]
    for label, matrix in matrices:
        print(f'{label} n_dimensions: {matrix.shape[1]:,.0f}')
def updated_n_dimensions():
    """Print the number of dimensions of the lyrics data after performing
    TF-IDF."""
    # Module-level LSA-reduced matrices, overall first then per era.
    matrices = [('All years', X_all_lsa), ('2006-2010', X_0610_lsa),
                ('2011-2015', X_1115_lsa), ('2016-2021', X_1621_lsa)]
    for label, matrix in matrices:
        print(f'{label} n_dimensions: {matrix.shape[1]:,.0f}')
def optimal_k_nmf(X):
    """Returns a plot showing the optimal K for NMF.

    Fits NMF for k = 2..10, plots the reconstruction error, marks the k
    with the smallest error, and returns a summary string.
    """
    # Elbow
    init = "nndsvda"
    fig, ax = plt.subplots(figsize=(10, 5))
    reconstruction_error = []
    K = range(2, 11)
    for k in K:
        # Light L1 regularization on both factor matrices.
        nmf = NMF(n_components=k,
                  random_state=42,
                  init=init,
                  beta_loss="frobenius",
                  alpha_W=0.00005,
                  alpha_H=0.00005,
                  l1_ratio=1)
        nmf = nmf.fit(X)
        reconstruction_error.append(nmf.reconstruction_err_)
    # +2 maps the argmin index back to k (K starts at 2).
    # NOTE(review): reconstruction error usually decreases monotonically
    # with k, so argmin tends to select the largest k tried rather than a
    # true elbow — confirm this is the intended selection rule.
    k_optimal_nmf = np.argmin(reconstruction_error)+2
    ax.plot(K, reconstruction_error, color=c_green, linewidth=5)
    ax.set_xlabel('k', fontsize=16, c=c_black)
    ax.set_ylabel('Reconstruction Error', fontsize=16, c=c_black)
    ax.set_title('Elbow Method For Optimal k', fontsize=24, c=c_black,
                 weight='bold')
    ax.axvline(k_optimal_nmf, ls='--', c=c_red)
    return f'Optimal K for NMF Topics: {k_optimal_nmf}'
def nmf_topics(X, feature_names, n_words=15):
    """Fit a 6-topic NMF model, print the top n_words words per topic,
    and return the model together with its factor matrices U (document
    x topic) and V (term x topic).
    """
    model = NMF(6, beta_loss="frobenius",
                random_state=42, max_iter=500)
    doc_topic = model.fit_transform(X)
    topic_term_t = model.components_.T
    for topic_idx, weights in enumerate(model.components_):
        print(f"THE TOP {n_words} WORDS PER TOPIC {topic_idx+1}")
        # argsort is ascending, so the last n_words are the heaviest terms.
        print([feature_names[j] for j in weights.argsort()[-n_words:]])
        print('\n')
    return model, doc_topic, topic_term_t
def nmf_topics_era(X, feature_names, n_words=15):
    """Fit a 6-topic NMF model on an era's TF-IDF matrix and return the
    model together with its factor matrices U (document x topic) and
    V (term x topic).  Same as nmf_topics but without the printing.
    """
    model = NMF(6, beta_loss="frobenius",
                random_state=42, max_iter=500)
    doc_topic = model.fit_transform(X)
    topic_term_t = model.components_.T
    return model, doc_topic, topic_term_t
def nmf_topics_all(df_all, years=(2006, 2011, 2016, 2022)):
    """Count songs per topic for each era delimited by *years*.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Must contain integer 'year' and 'topic' columns.
    years : sequence of int, optional
        Era boundaries; each consecutive pair (x, y) defines the
        half-open era range(x, y), labelled 'x-(y-1)'.  The default is
        now an immutable tuple (mutable-default-argument fix).

    Returns
    -------
    pandas.DataFrame
        Topic counts (rows) per era (columns); NaN where a topic never
        appears in an era.
    """
    years = list(years)
    era_counts = []
    # zip(years, years[1:]) walks consecutive boundary pairs, removing
    # the dependency on the module-level pairwise() helper.
    for start, end in zip(years, years[1:]):
        counts = (
            df_all[df_all['year'].isin(range(start, end))]
            .value_counts('topic')
            .sort_index()
            .to_frame()
        )
        counts.columns = [f'{start}-{end - 1}']
        era_counts.append(counts)
    if not era_counts:
        # Fewer than two boundaries -> no eras (matches original output).
        return pd.DataFrame()
    return pd.concat(era_counts, axis=1)
def nmf_topics_year(df_all, years):
    """Count songs per topic for each individual year in *years*.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Must contain integer 'year' and 'topic' columns.
    years : iterable of int
        Years to tabulate; each becomes one (string-labelled) column.

    Returns
    -------
    pandas.DataFrame
        Integer topic counts (rows) per year (columns); missing
        topic/year combinations are 0.
    """
    yearly_counts = []
    for year in years:
        counts = (
            df_all[df_all['year'] == year]
            .value_counts('topic')
            .sort_index()
            .to_frame()
        )
        counts.columns = [f'{year}']
        yearly_counts.append(counts)
    if not yearly_counts:
        return pd.DataFrame()
    # fillna/astype hoisted out of the loop: the original re-applied them
    # after every concat; a single final pass yields the same result.
    return pd.concat(yearly_counts, axis=1).fillna(0).astype(int)
def nmf_df(nmf_model, X, df, years):
    """Returns a plot of the most represented topics for each era.

    Side effect: adds/overwrites a 'topic' column on *df* holding the
    1-based dominant topic of each song under *nmf_model*.
    """
    topic_results = nmf_model.transform(X)
    # Dominant topic per song, shifted to 1-based labels.
    df['topic'] = topic_results.argmax(axis=1)+1
    df_nmf = nmf_topics_year(df, years)
    # Transposed so years run along the x-axis, one line per topic.
    df_nmf.T.plot(figsize=(10, 5))
    plt.legend(loc='upper left')
    plt.title("Most Represented Topic", fontsize=24, weight='bold')
    plt.xlabel("Years", fontsize=24)
    plt.ylabel("Number of Songs", fontsize=24)
    plt.show()
    return df_nmf
def display_topics(df, n_topics=6, samples=True, n_samples=10):
    """Display up to *n_samples* top-ranked songs for each NMF topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'rank', 'song', 'artist', and a 1-based 'topic'
        column (as added by nmf_df).
    n_topics : int, optional
        Number of topics to display.
    samples : bool, optional
        When False, do nothing.
    n_samples : int, optional
        Maximum songs shown per topic.
    """
    if not samples:
        return
    for i in range(1, n_topics + 1):
        print(f'Topic {i}')
        # .head(n) never raises when fewer than n rows exist, so the
        # original bare try/except fallback to .head() was dead code
        # (and would have masked real errors); it has been removed.
        display(df
                .reset_index().sort_values('rank')
                .loc[pd.Series(df['topic']) == i,
                     ['rank', 'song', 'artist']]
                .head(n_samples))
# Davies Bouldin score for K means
def get_kmeans_score(data, center):
'''Returns kmeans score regarding Davies Bouldin for points to centers'''
# instantiate kmeans
kmeans = KMeans(n_clusters=center, random_state=42)
# Then fit the model to your data using the fit method
model = kmeans.fit_predict(data)
# Calculate Davies Bouldin score
score = davies_bouldin_score(data, model)
return score
def optimal_k(X, k_optimal=6):
    """Returns elbow plot to find optimal K using sum of squared distances.

    Draws three diagnostics side by side for k = 2..10 — inertia (elbow),
    silhouette score (cosine), and Davies-Bouldin score — with a vertical
    line at *k_optimal* on each.
    """
    # Elbow: within-cluster sum of squared distances (inertia).
    Sum_of_squared_distances = []
    K = range(2, 11)
    for k in K:
        km = KMeans(n_clusters=k, random_state=42)
        km = km.fit(X)
        Sum_of_squared_distances.append(km.inertia_)
    # Silhouette
    from sklearn.metrics import silhouette_score
    sil = []
    # dissimilarity would not be defined for a single cluster, thus,
    # minimum number of clusters should be 2
    for k in K:
        kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
        labels = kmeans.labels_
        sil.append(silhouette_score(X, labels, metric='cosine'))
    # Davies Bouldin score (re-fits K-means inside get_kmeans_score).
    scores = []
    centers = list(K)
    for center in centers:
        scores.append(get_kmeans_score(X, center))
    # Plotting: shared marker/line styling across the three panels.
    fig, ax = plt.subplots(1, 3, figsize=(20, 5))
    style = '-og'
    lw = 3
    ms = 7
    mfc = c_green
    c = c_red
    ax[0].plot(K, Sum_of_squared_distances, style, lw=lw, ms=ms, mfc=mfc)
    ax[0].set_xlabel('k')
    ax[0].set_ylabel('Sum of squared distances')
    ax[0].set_title('Elbow Method For Optimal k', fontsize=24, weight='bold')
    ax[0].axvline(k_optimal, ls='--', c=c)
    ax[1].plot(K, sil, style, lw=lw, ms=ms, mfc=mfc)
    ax[1].set_xlabel('k')
    ax[1].set_ylabel('Silhouette score')
    ax[1].set_title('Silhouette score vs. K', fontsize=24, weight='bold')
    ax[1].axvline(k_optimal, ls='--', c=c)
    ax[2].plot(centers, scores, style, lw=lw, ms=ms, mfc=mfc)
    ax[2].set_xlabel('k')
    ax[2].set_ylabel('Davies Bouldin score')
    ax[2].set_title('Davies Bouldin score vs. K', fontsize=24, weight="bold")
    ax[2].axvline(k_optimal, ls='--', c=c)
def nmf_kmeans(X, U, k=6):
    """Plot K-means clustering of the NMF document-topic matrix and
    return the cluster labels.

    Parameters
    ----------
    X : scipy sparse matrix
        TF-IDF matrix; used only for the 2-D PCA projection that the
        points are plotted on.
    U : ndarray
        NMF document-topic matrix, the data actually clustered.
    k : int, optional
        Number of K-means clusters.

    Returns
    -------
    ndarray
        K-means cluster label per song.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    pca = PCA(2, random_state=42)
    kmeans = KMeans(k, random_state=42)
    # Fit once and reuse the labels: the original called fit_predict
    # three separate times, refitting the same model with identical
    # results (fixed random_state); it also computed an unused pca_num.
    labels = kmeans.fit_predict(U)
    kmn_num = len(set(labels))
    ax.scatter(*pca.fit_transform(X.todense()).T,
               c=labels, cmap='Set1')
    ax.set_xlabel('PC1', fontsize=16)
    ax.set_ylabel('PC2', fontsize=16)
    ax.set_title(f'Clustering (K-Means) - {kmn_num} clusters', fontsize=24,
                 weight='bold')
    return labels
def plot_wcloud_km(df, k=300, n_clusters=6):
    """Plot one word cloud per K-means cluster (two rows of subplots)
    and print the 15 most frequent words of each cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'k_cluster' (0-based labels) and 'lyrics' columns.
    k : int, optional
        Maximum number of words rendered per cloud.
    n_clusters : int, optional
        Number of clusters to draw.
    """
    ncols = int(np.ceil(n_clusters/2))
    nrows = 2
    fig = plt.figure(figsize=(14, nrows*(14/ncols)))
    for i in range(n_clusters):
        ax = fig.add_subplot(nrows, ncols, i+1)
        df_i = df[df['k_cluster'] == i]
        df_i_lyrics = df_i['lyrics'].apply(preprocess_lyrics)
        corpus = df_i_lyrics.tolist()
        # Bug fix: the ValueError for an over-pruned vocabulary is raised
        # by fit_transform, not the CountVectorizer constructor, so the
        # original try/except (which wrapped only the constructor) could
        # never reach its looser max_df fallback.
        try:
            cv = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.50,
                                 ngram_range=(1, 1))
            x = cv.fit_transform(corpus).toarray()
        except ValueError:
            cv = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.80,
                                 ngram_range=(1, 1))
            x = cv.fit_transform(corpus).toarray()
        feature_names = cv.get_feature_names_out()
        # Total occurrences per term across the cluster's songs.
        bows = pd.DataFrame({'count': x.sum(axis=0), 'word': feature_names})
        bows_sorted = bows.sort_values('count', ascending=False)[
            'word'].tolist()
        bow = " ".join(bows_sorted)
        wc = WordCloud(background_color='white',
                       colormap='winter',
                       max_words=k,
                       contour_color=c_blue,
                       contour_width=3,
                       font_step=2,
                       relative_scaling=0,
                       random_state=42).generate(bow)
        plt.title(f"Km_cluster {i+1}", color=c_red, fontsize=24,
                  weight='bold')
        plt.imshow(wc)
        plt.axis("off")
        plt.tight_layout()
        pl = bows.sort_values("count", ascending=False)["word"][:15].to_numpy()
        print(f'Top 15 words for Km_cluster {i+1}:\n'
              f'{pl}')
def display_k_clusters(df, n_clusters):
    """Print K-means cluster sizes, then show the ten top-ranked songs
    in each cluster."""
    print(df.value_counts('k_cluster').sort_index())
    ranked = df.sort_values('rank')
    for cluster_id in range(n_clusters):
        print(f'Cluster {cluster_id + 1}')
        display(ranked.loc[ranked['k_cluster'] == cluster_id,
                           ['song', 'artist']].head(10))
def plot_dendrogram(model, **kwargs):
    """Build a SciPy linkage matrix from a fitted agglomerative model
    and plot its dendrogram; extra kwargs go to scipy's dendrogram."""
    n_samples = len(model.labels_)
    # Leaf count per merge: a child index < n_samples is a single leaf,
    # otherwise it refers to the (child_idx - n_samples)-th earlier merge.
    counts = np.zeros(model.children_.shape[0])
    for row, merged_pair in enumerate(model.children_):
        total = 0
        for child in merged_pair:
            total += 1 if child < n_samples else counts[child - n_samples]
        counts[row] = total
    # SciPy linkage format: [child_a, child_b, distance, leaf_count].
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)
    return dendrogram(linkage_matrix, **kwargs)
def display_clusters(X, df, n_clusters=6, samples=True, t=0.4, n_rows=3):
    """Display dendrogram, cluster sizes, sample per cluster and
    return dataframe with added cluster column.

    Parameters
    ----------
    X : array-like
        Feature matrix; reduced to 2-D with PCA before clustering.
    df : pandas.DataFrame
        Song frame with 'rank', 'song', 'artist'; gains a 'cluster'
        column (side effect) and is returned.
    n_clusters : int, optional
        Number of agglomerative clusters.
    samples : bool, optional
        Whether to display sample songs per cluster.
    t : float, optional
        Dendrogram color threshold.
    n_rows : int, optional
        Sample songs shown per cluster.
    """
    pca = PCA(2, random_state=42)
    data = pca.fit_transform(X)
    # NOTE(review): 'affinity' was renamed to 'metric' in newer
    # scikit-learn releases; kept as-is for the pinned environment.
    model = AgglomerativeClustering(distance_threshold=None,
                                    n_clusters=n_clusters,
                                    compute_distances=True,
                                    affinity='cosine',
                                    linkage='average'
                                    )
    model = model.fit(data)
    # Plot
    fig = plt.figure(figsize=(16, 5))
    plt.title("Hierarchical Clustering Dendrogram", fontsize=24,
              weight='bold')
    plot_dendrogram(model, truncate_mode="level", p=4, color_threshold=t)
    plt.xlabel("Number of points in node "
               "(or index of point if no parenthesis).")
    plt.show()
    print(pd.Series(model.labels_).value_counts().sort_index())
    if samples:
        for i in range(n_clusters):
            print(f'Cluster {i+1}')
            # The original wrapped this display in a bare try/except whose
            # two branches were byte-identical — pure dead code that would
            # also have swallowed real errors; it has been removed.
            display(df
                    .reset_index().sort_values('rank')
                    .loc[pd.Series(model.labels_) == i,
                         ['rank', 'song', 'artist']]
                    .head(n_rows))
    # Add cluster columns to df
    df['cluster'] = model.labels_
    return df
def plot_wcloud_ac(df, k=300, n_clusters=6):
    """Plot one word cloud per agglomerative cluster (two rows of
    subplots) and print the 15 most frequent words of each cluster.

    Expects *df* to carry 'cluster' (0-based labels, as added by
    display_clusters) and 'lyrics' columns.
    """
    # ceil(n_clusters/2) columns spread over two fixed rows.
    ncols = int(np.ceil(n_clusters/2))
    nrows = 2
    fig = plt.figure(figsize=(14, nrows*(14/ncols)))
    for i in range(n_clusters):
        ax = fig.add_subplot(nrows, ncols, i+1)
        # Songs assigned to agglomerative cluster i.
        df_i = df[df['cluster'] == i]
        df_i_lyrics = df_i['lyrics'].apply(preprocess_lyrics)
        corpus = df_i_lyrics.tolist()
        cv = CountVectorizer(stop_words=stopwords, min_df=3,
                             ngram_range=(1, 1))
        x = cv.fit_transform(corpus).toarray()
        feature_names = cv.get_feature_names_out()
        # Total occurrences per term across the cluster's songs.
        bows = pd.DataFrame({'count': x.sum(axis=0), 'word': feature_names})
        bows_sorted = bows.sort_values('count', ascending=False)[
            'word'].tolist()
        bow = " ".join(bows_sorted)
        wc = WordCloud(background_color='white',
                       colormap='winter',
                       max_words=k,
                       contour_color=c_blue,
                       contour_width=3,
                       random_state=42).generate(bow)
        plt.title(f"Cluster {i+1}", color=c_red, fontsize=24)
        plt.imshow(wc)
        plt.axis("off")
        plt.tight_layout()
        # Report the 15 most frequent words of this cluster.
        pl = bows.sort_values("count",ascending=False)["word"][:15].to_numpy()
        print(f'Top 15 words for Cluster {i+1}:\n'
              f'{pl}\n')
Music plays an important role in our society with an implicit effect on emotions, morals, and culture. It is also a way to describe social changes within communities and in the world. Every day, we hear music as a way to relax, express emotions, exhibit creativity, and to cope with everything that happens in our lives. Different music genres and lyrics connect to us on a different level. Hence, this explains why the global music streaming market size is continuously growing, with a market size as high as USD 29.45 billion in 2022, and is expected to expand with an annual growth rate of 14.7% from 2022 to 2030.
As opposed to how regular playlists are generated in most music streaming sites that utilize genre and existing playlists generated by other users, and given that there is a growing number of huge online societies that are interested and are participating in listening to, obtaining meaning, and expressing their views about various songs' lyrics, such as those hosted by "Last.fm", our team explored clustering songs using moods or social tags related to each song's lyrics to answer the following problem statement:
What are the common themes based on lyrics that prevailed among the top songs from 2006 to 2021?
To aid our analysis, the team applied a methodology involving data collection, data preprocessing using webscraping, data exploration by performing exploratory data analysis, converting the lyrics data from string to numerical values using Term Frequency—Inverse Document Frequency (TF-IDF), using dimensionality reduction method such as truncated Singular Value Decomposition (SVD), and applying clustering methods such as Nonnegative Matrix Factorization, K-Means Clustering, and Agglomerative Hierarchical Clustering to finally provide a thematic view of the dataset.
For this report we opted to use the Year-End Hot-100 Songs from Billboard. Despite the collection being generally diverse, this list only contains the most popular songs from 2006 to 2021 based on the metric set by Billboard. We assume that the best or top songs are representative of the song population for that year.
Before we proceed with the clustering portion, our team used TF-IDF in converting the lyrics data from string to numerical values. After several iterations and review of the results, we ultimately used min_df=3, max_df=50%, ngram_range=(1,3), and manually identified/listed stop words as our parameters. This yielded an additional 11,262 dimensions in the dataset. Using this new data frame, our team further divided the data between three eras with years 2006-2010, 2011-2015, and 2016-2021 for further thematic analysis.
To reduce the dimensions while retaining 95% of its meaning, we performed dimensionality reduction using truncated SVD which helped us reduce the dimensions from 11,262 to 1,191.
After exploring the three clustering methods, we were able to arrive at the following conclusion:
Overall/all year clustering results yielded Life, relationships, triumphs, Gangster culture, rhythmic, upbeat, and hiphop, Passionate, extreme love or loneliness, sexual, Rap and sexual, Catchy, easy recall, repetitive, and Dance and party as the main themes.
Whereas era clusters resulted to the following:
2006-2010's main themes are Romantic, Dance and Catchy songs, Passion, Sexual, and Life.
2011-2015's main themes are Life, Rap + Hip-hop, Love, and Dance and Party.
2016-2021's main themes are Life, Rap + Hip-hop, Love, and Gangster.
With these results, our team recommends that for future studies, songwriters and artists can use lyric-based analysis and classification of the top songs over the years. They can use this to identify keywords and themes that recently successful songs have, and strategize what messages they want to include when crafting new songs. The clusters formed by the models in this study can also be used to generate playlists that have one common theme. While for model improvements, our team identified that we can increase the scope of our data by scraping more songs outside of the Hot-100. We can get more insights from the clustering and analyzing songs that aren't as successful as the top songs. We can also utilize metadata that will help identify the social tags of the clusters such as genre, sub-genres, song length, and others. Aside from the bare lyrics, we can use lyric interpretations from Genius.com or other sources. This will give more information about the context of songs, given that some songs don't directly say the message hidden behind the words. And finally, we can also explore using NMF directly without using the truncated SVD since it is also a dimensionality reduction method that could immediately work on large files and provide the most appropriate clusters.
What are the common themes based on lyrics that prevailed among the top songs from 2006 to 2021?
Given that there is a growing number of huge online communities that are interested and are participating in listening to, obtaining meaning, and expressing their views about various songs' lyrics, such as those hosted by "Last.fm", our team would like to explore using the moods or social tags relevant for each song's lyrics to generate insights about the common themes that prevailed among the top songs from 2006 to 2021.
The methodology of this report aims to gather insights from the common themes based on song lyrics that prevailed among the top songs from 2006 to 2021.
The high-level methodology used in this study is as follows:
| No. | Step | Description |
|---|---|---|
| 1. | Data Collection | Scrape the yearly top 100 songs, including the corresponding rank, artist, and year from Billboard.com[1] using BeautifulSoup covering the periods from 2006 to 2021. Then scrape the lyrics related to the aforementioned songs from the Genius.com's[2] lyricsgenius library. |
| 2. | Data Preprocessing | Clean the scraped lyrics by removing common words in a song that do not provide additional context such as "oh", "yeah", "huh", and others by performing data validation, removing stop words and lemmatizing the words. |
| 3. | Data Exploration | Perform Exploratory Data Analysis (EDA) to explore and understand the relationships between our features, identify any trends/patterns, and provide insights regarding the songs scraped. Convert the lyrics into vectorized form and use Term Frequency — Inverse Document Frequency (TF-IDF). |
| 4. | Dimensionality Reduction | Use Truncated SVD to reduce the number of dimensions of the TF-IDF converted lyrics while preserving 95% of the dataset's cumulative explained variance. |
| 5. | Determining Clusters: | For Non-negative Matrix Factorization (NMF), use elbow method of the reconstruction error (Frobenius norm of the matrix difference, between the training data and the reconstructed data from the fitted model) as a basis for finding the optimal K or number of topics/clusters. For K-Means Clustering and Agglomerative Hierarchical Clustering use elbow method of the sum of squared distances in conjunction with Silhouette score and Davies Bouldin score as the basis for finding the optimal K or clusters. |
| 6. | Cluster Analysis | Perform actual cluster analysis using the optimal k for NMF, K-Means Clustering, and Agglomerative Hierarchical Clustering then provide insights regarding the clusters formed. |
The detailed steps performed related to the above methodology is presented in the Data Exploration and Results and Discussion sections of this document.
The sources of the songs dataset used in this study are Billboard.com's website [1] and Genius.com's[2] lyricsgenius library.
The team scraped all the yearly hot 100 songs' rank, song title, artist, and year from 2006 to 2021 from Billboard and combined it with the song lyrics scraped from the lyricsgenius library.
After scraping and combining the songs info and lyrics, we were able to gather and use 1598 songs (rows) and five base features (columns) (before vectorization). The following columns were used and considered relevant for this study:
| Column Name | Data Type | Short description |
|---|---|---|
| rank | int64 | Rank of the song for a given year. |
| song | object | Title of the song. |
| artist | object | Name of the singer of the song. |
| year | int64 | Year when the song ranked in the hot 100 songs of Billboard's website[1] |
| lyrics | object | Lyrics of the song. |
For this report we opted to use the Year-End Hot-100 Songs from Billboard. Despite the collection being generally diverse, this list only contains the most popular songs from 2006 to 2021 based on the metric set by Billboard. We assume that the best or top songs are representative of the song population for that year.
def get_top_100(soup):
    """Returns the rank, song, and artist of the top 100 songs from
    Billboard.com"""
    # NOTE(review): byte-identical duplicate of the definition earlier in
    # this file — a repeated notebook cell shown for the narrative.
    # '.c-title' nodes hold song titles; sibling '.c-label' nodes hold artists.
    songs = [x.text.strip() for x in soup.select('li .c-title')]
    artists = [x.text.strip() for x in soup.select('li .c-title ~ .c-label')]
    # 1-based chart rank -> {'song': ..., 'artist': ...}
    return {rank: {'song': song, 'artist': artist} for rank,
            (song, artist) in enumerate(zip(songs, artists), start=1)}
# proxies = {'http': 'http://206.189.157.23'}
# years = range(2006, 2022)
# song_data = {}
# for year in years:
# url = f'https://www.billboard.com/charts/year-end/{year}/hot-100-songs/'
# soup = BeautifulSoup(requests.get(url, proxies=proxies).content)
# song_data[year] = get_top_100(soup)
# df_all_years = pd.DataFrame()
# for year in years:
# df_year = pd.DataFrame(song_data[year]).T.reset_index().rename(
# columns={'index': 'rank'})
# df_year['year'] = year
# df_all_years = pd.concat([df_all_years, df_year])
# df_all_years.to_pickle("all_songs.pkl")
Genius[2] is a website that hosts song lyrics where passages can be highlighted and annotated with interpretations, explanations, and references. However, the Genius API doesn’t provide a way to download the lyrics themselves. The lyricsgenius is a library that was made with Beautiful Soup, this can be used to scrape the lyrics based on a search query.
Created queries by combining song name with first two words in artists
# df_test = df_all_years.copy()
# df_test['query'] = (df_test['song'] + ' ' +
# df_test['artist'].apply(lambda x: " ".join(x.split()[:2])))
# df_test['query'] = df_test['query'].apply(preprocess_query)
Some titles were censored in Billboard and some artists contained words like featuring and x. The following preprocessing function fixes those queries that fail in searching for songs in Genius. Correction of the queries were done using RegEx sub function
def preprocess_query(query):
    """Normalize a Billboard "song + artist" string into a query that
    succeeds in a Genius search.

    Fixes three classes of failures: trailing collaborator markers
    ('featuring', ' x', ' /'), words censored by Billboard (f**k, etc.),
    and a handful of one-off title mismatches between the two sites.
    """
    # Convert to lowercase
    query = query.lower()
    # Drop collaborator suffixes from the end of the query
    query = query.rsplit('featuring', 1)[0]
    query = query.rsplit(' x', 1)[0]
    query = query.rsplit(' /', 1)[0]
    # Replacement patterns are raw strings so regex escapes such as \* and
    # \$ are not treated as (deprecated) string escape sequences.
    query = re.sub(r"the black", "black", query)
    query = re.sub(r"f\*\*k", "fuck", query)
    query = re.sub(r"ni\*\*a", "nigga", query)
    query = re.sub(r"b\*\*\*\*", "bitch", query)
    query = re.sub(r"part 2", "pt 2", query)
    query = re.sub(r"p\*\$\$y", "pussy", query)
    query = re.sub(r"ily surf", "ily (i love you baby) surf", query)
    query = re.sub(r"\(spider-man: into the spider-verse\)", "", query)
    query = re.sub(r"\(fifty shades darker\)", "", query)
    query = re.sub(r"it ain't me", "it aint me", query)
    query = re.sub(r"tik tok ke\$ha", "tik tok kesha", query)
    query = re.sub(r"jay-z, rihanna", "jayz", query)
    return query
Lyricsgenius genius.search_songs method uses the search function within the website and gets the song details of the top result. Like most APIs, it returns the information in JSON format. The id is then extracted from the JSON and is used as the parameter in the genius.lyrics method to get the lyrics.
def get_lyrics(query):
    """Search Genius for *query* and return the lyrics of the top hit."""
    # Only the best match is needed, so ask for a single result per page.
    search_result = genius.search_songs(f"{query}", per_page=1, page=1)
    song_id = search_result['hits'][0]['result']['id']
    print(f'Got lyrics from query: {query}')
    return genius.lyrics(song_id)
The lyrics were scraped per year and stored in pickle files.
# df_2006 = df_test[df_test['year'] == 2006].copy()
# df_2007 = df_test[df_test['year'] == 2007].copy()
# df_2008 = df_test[df_test['year'] == 2008].copy()
# df_2009 = df_test[df_test['year'] == 2009].copy()
# df_2010 = df_test[df_test['year'] == 2010].copy()
# df_2011 = df_test[df_test['year'] == 2011].copy()
# df_2012 = df_test[df_test['year'] == 2012].copy()
# df_2013 = df_test[df_test['year'] == 2013].copy()
# df_2014 = df_test[df_test['year'] == 2014].copy()
# df_2015 = df_test[df_test['year'] == 2015].copy()
# df_2015 = df_test[df_test['year'] == 2015].copy()
# df_2016 = df_test[df_test['year'] == 2016].copy()
# df_2017 = df_test[df_test['year'] == 2017].copy()
# df_2018 = df_test[df_test['year'] == 2018].copy()
# df_2019 = df_test[df_test['year'] == 2019].copy()
# df_2020 = df_test[df_test['year'] == 2020].copy()
# df_2021 = df_test[df_test['year'] == 2021].copy()
# df_2006['lyrics'] = df_2006['query'].apply(get_lyrics)
# df_2006.to_pickle('songs_2006.pkl')
# df_2007['lyrics'] = df_2007['query'].apply(get_lyrics)
# df_2007.to_pickle('songs_2007.pkl')
# df_2008['lyrics'] = df_2008['query'].apply(get_lyrics)
# df_2008.to_pickle('songs_2008.pkl')
# df_2009['lyrics'] = df_2009['query'].apply(get_lyrics)
# df_2009.to_pickle('songs_2009.pkl')
# df_2010['lyrics'] = df_2010['query'].apply(get_lyrics)
# df_2010.to_pickle('songs_2010.pkl')
# df_2011['lyrics'] = df_2011['query'].apply(get_lyrics)
# df_2011.to_pickle('songs_2011.pkl')
# df_2012['lyrics'] = df_2012['query'].apply(get_lyrics)
# df_2012.to_pickle('songs_2012.pkl')
# df_2013['lyrics'] = df_2013['query'].apply(get_lyrics)
# df_2013.to_pickle('songs_2013.pkl')
# df_2014['lyrics'] = df_2014['query'].apply(get_lyrics)
# df_2014.to_pickle('songs_2014.pkl')
# df_2015['lyrics'] = df_2015['query'].apply(get_lyrics)
# df_2015.to_pickle('songs_2015.pkl')
# df_2016['lyrics'] = df_2016['query'].apply(get_lyrics)
# df_2016.to_pickle('songs_2016.pkl')
# df_2017['lyrics'] = df_2017['query'].apply(get_lyrics)
# df_2017.to_pickle('songs_2017.pkl')
# df_2018['lyrics'] = df_2018['query'].apply(get_lyrics)
# df_2018.to_pickle('songs_2018.pkl')
# df_2019['lyrics'] = df_2019['query'].apply(get_lyrics)
# df_2019.to_pickle('songs_2019.pkl')
# df_2020['lyrics'] = df_2020['query'].apply(get_lyrics)
# df_2020.to_pickle('songs_2020.pkl')
# df_2021['lyrics'] = df_2021['query'].apply(get_lyrics)
# df_2021.to_pickle('songs_2021.pkl')
# Recombine the per-year lyric pickles into a single dataframe.
years = range(2006, 2022)
# Collect the frames first and concatenate once; repeated pd.concat inside
# the loop copies the accumulated frame on every iteration (quadratic cost).
frames = [pd.read_pickle(f'songs_{year}.pkl') for year in years]
df_all = pd.concat(frames)
# 'query' was only needed for scraping; drop it and renumber the rows.
df_all = df_all.drop(columns=['query']).reset_index(drop=True)
display(df_all.head(1))
| rank | song | artist | year | lyrics | |
|---|---|---|---|---|---|
| 0 | 1 | Bad Day | Daniel Powter | 2006 | Bad Day Lyrics\nWhere is the moment we needed the most?\nYou kick up the leaves and the magic is lost\nThey tell me your blue skies fade to gray\nThey tell me your passion's gone away\nAnd I don't need no carryin' on\n\nYou stand in the line just to hit a new low\nYou're faking a smile with the coffee to go\nYou tell me your life's been way off line\nYou're falling to pieces every time\nAnd I don't need no carryin' on\nBecause you had a bad day\nYou're taking one down\nYou sing a sad song just to turn it around\nYou say you don't know\nYou tell me, don't lie\nYou work at a smile and you go for a ride\nYou had a bad day\nThe camera don't lie\nYou're coming back down and you really don't mind\nYou had a bad day\nYou had a bad day\n\nWill you need a blue sky holiday?\nThe point is they laugh at what you say\nAnd I don't need no carryin' on\n\nYou had a bad day\nYou're taking one down\nYou sing a sad song just to turn it around\nYou say you don't know\nYou tell me, don't lie\nYou work at a smile and you go for a ride\nYou had a bad day\nThe camera don't lie\nYou're coming back down and you really don't mind\nYou had a bad day\n(Ooh, a holiday)\nYou might also like\nSometimes the system goes on the blink\nAnd the whole thing turns out wrong\nYou might not make it back and you know\nThat you could be well, oh, that strong\nAnd I'm not wrong\n(Yeah, yeah, yeah)\n\nSo where is the passion when you need it the most?\nOh, you and I\nYou kick up the leaves and the magic is lost\n\n'Cause you had a bad day\nYou're taking one down\nYou sing a sad song just to turn it around\nYou say you don't know\nYou tell me don't lie\nYou work at a smile and you go for a ride\nYou had a bad day\nYou've seen what you're like\nAnd how does it feel?\nOne more time\nYou had a bad day\nYou had a bad day\nAh, yeah, yeah, yeah\nHad a bad day (Ah)\nHad a bad day (ah, yeah, yeah, yeah)\nHad a bad day (Ah)42Embed |
def preprocess_lyrics(lyrics):
    """Return lyrics lower-cased, stripped of Genius boilerplate and
    non-letter characters, and lemmatized as nouns, verbs, adjectives,
    and adverbs."""
    # Convert to lowercase
    lyrics = lyrics.lower()
    # Genius prepends "<Title> Lyrics\n"; keep only what follows it.
    lyrics = lyrics.split('yrics\n', 1)[-1]
    # Genius appends "<n>Embed" at the very end; remove it and the digits.
    lyrics = lyrics.rsplit('embed', 1)[0]
    lyrics = lyrics.rstrip('1234567890')
    # Mid-page banner injected by Genius.
    lyrics = lyrics.rsplit('you might also like', 1)[0]
    # Replace dashes with spaces so hyphenated words stay separated.
    lyrics = re.sub(r"[-]", " ", lyrics)
    # Keep letters and whitespace only.
    lyrics = re.sub(r"[^\sA-Za-z]", "", lyrics)
    # Normalize every whitespace character to a plain space.
    lyrics = re.sub(r"[\s]", " ", lyrics)
    # Lemmatize once per part of speech, feeding each pass the output of
    # the previous one. The original rebuilt the string from the raw
    # lyrics on every pass, so only the final (adverb) pass took effect.
    lemma = WordNetLemmatizer()
    for pos in [wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV]:
        lyrics = " ".join(lemma.lemmatize(word, pos)
                          for word in lyrics.split())
    return lyrics
df_all['lyrics'] = df_all['lyrics'].apply(preprocess_lyrics)
display(df_all.head(1))
| rank | song | artist | year | lyrics | |
|---|---|---|---|---|---|
| 0 | 1 | Bad Day | Daniel Powter | 2006 | where is the moment we needed the most you kick up the leaves and the magic is lost they tell me your blue skies fade to gray they tell me your passions gone away and i dont need no carryin on you stand in the line just to hit a new low youre faking a smile with the coffee to go you tell me your lifes been way off line youre falling to pieces every time and i dont need no carryin on because you had a bad day youre taking one down you sing a sad song just to turn it around you say you dont know you tell me dont lie you work at a smile and you go for a ride you had a bad day the camera dont lie youre coming back down and you really dont mind you had a bad day you had a bad day will you need a blue sky holiday the point is they laugh at what you say and i dont need no carryin on you had a bad day youre taking one down you sing a sad song just to turn it around you say you dont know you tell me dont lie you work at a smile and you go for a ride you had a bad day the camera dont lie youre coming back down and you really dont mind you had a bad day ooh a holiday |
df_info(df_all)
Number of rows: 1598 Number of columns: 5 List of all columns, count of non-null values, and datatypes of the df: <class 'pandas.core.frame.DataFrame'> RangeIndex: 1598 entries, 0 to 1597 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 rank 1598 non-null int64 1 song 1598 non-null object 2 artist 1598 non-null object 3 year 1598 non-null int64 4 lyrics 1598 non-null object dtypes: int64(2), object(3) memory usage: 62.5+ KB
None
Summary statistics for numerical columns:
| rank | year | |
|---|---|---|
| count | 1,598.00 | 1,598.00 |
| mean | 50.44 | 2,013.50 |
| std | 28.84 | 4.61 |
| min | 1.00 | 2,006.00 |
| 25% | 25.25 | 2,009.25 |
| 50% | 50.00 | 2,013.50 |
| 75% | 75.00 | 2,017.75 |
| max | 100.00 | 2,021.00 |
Summary statistics for object(string) columns:
| count | unique | top | freq | |
|---|---|---|---|---|
| song | 1598 | 1395 | Dynamite | 4 |
| artist | 1598 | 910 | Taylor Swift | 24 |
| lyrics | 1598 | 1440 | you can be amazing you can turn a phrase into a weapon or a drug you can be the outcast or be the backlash of somebodys lack of love or you can start speaking up nothings gonna hurt you the way that words do when they settle neath your skin kept on the inside and no sunlight sometimes a shadow wins but i wonder what would happen if you say what you wanna say and let the words fall out honestly i wanna see you be brave with what you want to say and let the words fall out honestly i wanna see you be brave i just wanna see you i just wanna see you i just wanna see you i wanna see you be brave i just wanna see you i just wanna see you i just wanna see you i wanna see you be brave everybodys been there everybodys been stared down by the enemy fallen for the fear and done some disappearing bow down to the mighty dont run stop holding your tongue maybe theres a way out of the cage where you live maybe one of these days you can let the light in and show me how big your brave is | 2 |
Check for null columns:
rank 0 song 0 artist 0 year 0 lyrics 0 dtype: int64
Display % of null columns:
rank 0.00 song 0.00 artist 0.00 year 0.00 lyrics 0.00 dtype: float64
The data frame does not contain null values. No further cleaning and preprocessing steps are needed.
Since we are looking at the common themes based on lyrics that prevailed among the top songs from 2006 to 2021, we further segmented the data into three groups containing 5 or 6 years each. By splitting the data into three groups, we can better see significant changes in lyrical themes and how they evolved throughout the years as opposed to comparing them yearly.
# Segment the corpus into three eras of 5-6 years each for era-level
# comparison of lyrical themes (both year bounds are inclusive).
df_2006_2010 = df_all[(df_all['year'] >= 2006) & (df_all['year'] <= 2010)]
df_2011_2015 = df_all[(df_all['year'] >= 2011) & (df_all['year'] <= 2015)]
df_2016_2021 = df_all[(df_all['year'] >= 2016) & (df_all['year'] <= 2021)]
The objective of this EDA is to explore the dataset, understand the relationships between our features, identify any trends/patterns regarding the scraped list of lyrics of popular songs from lyricsgenius API.
How many songs do we typically have for each artist?
artist_songs_stats()
Average number of songs per artist: 1.76 Taylor Swift had the most number of songs with 24 songs
Based on Figure 3, the average number of songs per artist fall between 1 to 2 songs indicating that most of the artists appear more than once in the yearly top 100. From 2006 to 2021, Taylor Swift's songs placed within the top 100, 24 times.
How many words do a song lyrics typically have?
n_words_in_lyrics()
Average words in song lyrics: 276 words
On average, a song's lyrics have 276 words.
What words were used by the most number of songs?
For the final part of the EDA, we want to look at the words that are used by most unique songs in the entire database. The top words obtained can either be cluster-defining words, or just noise features that can be present in any cluster.
get_countvec(df_all, 15)
| count | % of songs | |
|---|---|---|
| love | 724 | 45.31 |
| go | 710 | 44.43 |
| see | 626 | 39.17 |
| cant | 623 | 38.99 |
| make | 612 | 38.30 |
| baby | 598 | 37.42 |
| wanna | 588 | 36.80 |
| say | 580 | 36.30 |
| never | 577 | 36.11 |
| time | 575 | 35.98 |
| take | 530 | 33.17 |
| back | 523 | 32.73 |
| way | 516 | 32.29 |
| right | 506 | 31.66 |
| want | 505 | 31.60 |
Overall, love, baby and time were the most used nouns by all the songs in the database. Meanwhile, the most used verbs were go, see and make.
top_words_per_era()
2006-2010 era
| count | % of songs | |
|---|---|---|
| see | 241 | 48.20 |
| go | 226 | 45.20 |
| love | 222 | 44.40 |
| make | 201 | 40.20 |
| wanna | 200 | 40.00 |
2011-2015 era
| count | % of songs | |
|---|---|---|
| get | 241 | 48.30 |
| love | 239 | 47.90 |
| make | 203 | 40.68 |
| go | 200 | 40.08 |
| baby | 194 | 38.88 |
2016-2021 era
| count | % of songs | |
|---|---|---|
| go | 284 | 47.41 |
| love | 263 | 43.91 |
| cant | 246 | 41.07 |
| say | 238 | 39.73 |
| never | 233 | 38.90 |
Throughout all the three eras, love and go were within the top 5 words used by unique songs.
Before we proceed with the clustering portion, we first need to transform the lyrics data from string to numerical values. This was done by transforming the lyrics data by using TF-IDF to vectorize the lyrics and give less importance to words that appear more frequently such as 'a', 'the', 'and', 'or', and other stop_words manually itemized in the utility functions section of this notebook.
To perform TF-IDF, we need to construct a vector space model for the song lyrics resulting in a term document (TD) matrix. Then apply TF-IDF (term frequency-inverse document frequency) weight normalization to TD. Both steps were processed using TFIDFVectorizer from sklearn.
Code used for this step is as follows:
def get_tfidf(df):
    """Vectorize the 'lyrics' column into a TF-IDF feature matrix.

    Returns the sparse matrix and the fitted vocabulary (feature names).
    Uses the module-level `stopwords` list and keeps 1- to 3-grams that
    appear in at least 3 songs but at most 50% of them.
    """
    vectorizer = TfidfVectorizer(min_df=3, max_df=0.50, ngram_range=(1, 3),
                                 stop_words=stopwords)
    tfidf_matrix = vectorizer.fit_transform(df['lyrics'].tolist())
    return tfidf_matrix, vectorizer.get_feature_names_out()
After several iterations of different combinations of the appropriate parameters that would provide us with results that makes the most sense without removing the essence of the meaning of each song, we arrived at the following final parameters:
ngram_range: use words up to 3 ngrams.
During testing: Using only 1-gram for this resulted to clusters' top words that do not provide real context about the meaning of the song and provided vague meaning even when combined with more than top 30 words. Including bi-grams somehow resulted to the same conclusion as 1-gram. Eventually, after including 3-grams, the results provided more appropriate patterns and meaning about the lyrics of the song. Hence, 1 to 3 n-grams were finally used. ngrams more than 3 were not considered anymore due to computational/memory constraints. Furthermore, even if including bigrams and 3-grams resulted to top words with repeated words (love love love, go go go, etc.), the results still accurately provided context on whether the words should be put between catchy (repetitive) cluster or the appropriate cluster (love, or others) where it should be in.
stopwords variable set in the utility functions section of this notebook. The final list of stopwords was a result of several testing and rechecking of the results of the overall models and methods used in this notebook. The essence or meaning of the songs remained unchanged despite the removal of the stopwords, indicating that those stopwords do not provide additional info or context about the lyrics.X_all, X_all_feature_names = get_tfidf(df_all)
X_0610, X_0610_feature_names = get_tfidf(df_2006_2010)
X_1115, X_1115_feature_names = get_tfidf(df_2011_2015)
X_1621, X_1621_feature_names = get_tfidf(df_2016_2021)
n_dimensions()
All years n_dimensions: 11,262 2006-2010 n_dimensions: 3,380 2011-2015 n_dimensions: 3,125 2016-2021 n_dimensions: 4,404
After using TF-IDF to our lyrics data, it added a total of 11,262 dimensions to our base dataframe. Due to the large size of our dataset, we decided to perform dimensionality reduction. Our goal here was to reduce the number of features from our data without having much information loss. This would enable us to better interpret the data while significantly reducing the time it takes to process them. In this step, we decided to use Truncated Singular Value Decomposition (SVD) since it works better for sparse matrices such as the bag of words of our song lyrics transformed into TF-IDF matrix.
To do this, we performed the following:
def truncated_svd(X):
    """Return the Q, Sigma, and P SVD factors of the design matrix X as
    well as the normalized squared singular values (the per-component
    share of explained variance).

    Uses the economy-size SVD (full_matrices=False): for a wide matrix
    like the ~1598 x 11262 TF-IDF corpus here, the full right-singular
    matrix alone would be 11262 x 11262 (on the order of a gigabyte),
    while the thin factors carry the same information for any projection
    onto k <= min(n, m) singular vectors.
    """
    q, s, p = np.linalg.svd(X, full_matrices=False)
    s2 = s ** 2
    # s2 / sum(s2): fraction of total variance captured by each SV.
    return q, np.diag(s), p.T, s2 / np.sum(s2)
def project_svd(q, s, k):
    """Project the design matrix onto its first k singular vectors."""
    # Take the first k columns of Q, scaled by the top-k singular values.
    return np.matmul(q[:, :k], s[:k, :k])
`
# Decompose each TF-IDF matrix; plot_sv reports the number of singular
# values (k) needed to reach 95% cumulative explained variance.
q_all, s_all, p_all, nssd_all = truncated_svd(X_all.todense())
k_all = plot_sv(nssd_all, 0.95, title='All years')
q_0610, s_0610, p_0610, nssd_0610 = truncated_svd(X_0610.todense())
k_0610 = plot_sv(nssd_0610, 0.95, title='Years 2006-2010')
q_1115, s_1115, p_1115, nssd_1115 = truncated_svd(X_1115.todense())
k_1115 = plot_sv(nssd_1115, 0.95, title='Years 2011-2015')
q_1621, s_1621, p_1621, nssd_1621 = truncated_svd(X_1621.todense())
k_1621 = plot_sv(nssd_1621, 0.95, title='Years 2016-2021')
Our team chose the corresponding number of singular values (SV) based on the explained variance that would retain most of the information from the data. We decided to have a 95% explained variance to retain 95% of the lyrics' meaning or text information. Based on the plots above, we reduced the dimensions from 11,262 to 1,191 SVs for the whole corpus, from 3,380 to 384 SVs for years 2006-2010, from 3,125 to 387 SVs for years 2011-2015, and 4404 to 465 SVs for years 2016-2021.
# Project each corpus onto its first k singular vectors (LSA features).
X_all_lsa = project_svd(q_all, s_all, k_all)
X_0610_lsa = project_svd(q_0610, s_0610, k_0610)
X_1115_lsa = project_svd(q_1115, s_1115, k_1115)
X_1621_lsa = project_svd(q_1621, s_1621, k_1621)
Updated number of dimensions after using project_svd:
updated_n_dimensions()
All years n_dimensions: 1,191 2006-2010 n_dimensions: 384 2011-2015 n_dimensions: 387 2016-2021 n_dimensions: 465
In this study, we explored three methods in clustering the songs based on the prevailing themes or topics of the songs lyrics:
The first step in using NMF is to find the optimal number of topics (clusters) or K.
For interpretability, we only selected K for up to 10 and used elbow method of the reconstruction error (Frobenius norm of the matrix difference, between the training data and the reconstructed data from the fitted model) as a basis for finding the optimal K.
optimal_k_nmf(X_all)
'Optimal K for NMF Topics: 6'
Print the top 15 words for 6 main topics identified using NMF function:
nmf_model_all, U_all, V_all = nmf_topics(X_all, X_all_feature_names, 15)
topic_results = nmf_model_all.transform(X_all)
df_all['topic'] = topic_results.argmax(axis=1)+1
df_all_nmf = nmf_topics_all(df_all)
display(df_all_nmf)
THE TOP 15 WORDS PER TOPIC 1 ['take', 'would', 'see', 'make', 'love', 'back', 'feel', 'way', 'heart', 'life', 'away', 'say', 'cant', 'time', 'never'] THE TOP 15 WORDS PER TOPIC 2 ['man', 'big', 'hit', 'back', 'bitches', 'ass', 'lil', 'real', 'woo', 'money', 'niggas', 'fuck', 'shit', 'bitch', 'nigga'] THE TOP 15 WORDS PER TOPIC 3 ['want love', 'love girl love', 'want', 'hard love', 'love hard', 'love baby', 'lucky', 'love like', 'cant feel', 'hard', 'love girl', 'girl love', 'love love love', 'love love', 'love'] THE TOP 15 WORDS PER TOPIC 4 ['little', 'boy', 'take', 'good', 'see', 'right', 'make', 'baby baby', 'body', 'tonight', 'need', 'want', 'wanna', 'girl', 'baby'] THE TOP 15 WORDS PER TOPIC 5 ['low low', 'go head', 'back', 'dm', 'talking', 'never', 'ready', 'head', 'know know', 'time', 'never go', 'low', 'go go go', 'go go', 'go'] THE TOP 15 WORDS PER TOPIC 6 ['see dance', 'plans', 'night', 'room', 'move move', 'living', 'way', 'floor', 'living room', 'stop', 'hands', 'move', 'dance dance dance', 'dance dance', 'dance']
| 2006-2010 | 2011-2015 | 2016-2021 | |
|---|---|---|---|
| topic | |||
| 1 | 217 | 199 | 206 |
| 2 | 65 | 67 | 177 |
| 3 | 25 | 32 | 28 |
| 4 | 119 | 139 | 124 |
| 5 | 41 | 31 | 42 |
| 6 | 33 | 31 | 22 |
display_topics(df_all, n_topics=6, samples=True, n_samples=3)
Topic 1
| rank | song | artist | |
|---|---|---|---|
| 0 | 1 | Bad Day | Daniel Powter |
| 1298 | 1 | Old Town Road | Lil Nas X Featuring Billy Ray Cyrus |
| 1198 | 1 | God's Plan | Drake |
Topic 2
| rank | song | artist | |
|---|---|---|---|
| 300 | 1 | Boom Boom Pow | The Black Eyed Peas |
| 699 | 1 | Thrift Shop | Macklemore & Ryan Lewis Featuring Wanz |
| 899 | 1 | Uptown Funk! | Mark Ronson Featuring Bruno Mars |
Topic 3
| rank | song | artist | |
|---|---|---|---|
| 1098 | 1 | Shape Of You | Ed Sheeran |
| 705 | 7 | Just Give Me A Reason | P!nk Featuring Nate Ruess |
| 306 | 7 | I'm Yours | Jason Mraz |
Topic 4
| rank | song | artist | |
|---|---|---|---|
| 100 | 1 | Irreplaceable | Beyonce |
| 999 | 1 | Love Yourself | Justin Bieber |
| 501 | 2 | Party Rock Anthem | LMFAO Featuring Lauren Bennett & GoonRock |
Topic 5
| rank | song | artist | |
|---|---|---|---|
| 200 | 1 | Low | Flo Rida Featuring T-Pain |
| 1399 | 2 | Circles | Post Malone |
| 1000 | 2 | Sorry | Justin Bieber |
Topic 6
| rank | song | artist | |
|---|---|---|---|
| 400 | 1 | TiK ToK | Ke$ha |
| 1498 | 1 | Levitating | Dua Lipa |
| 1001 | 3 | One Dance | Drake Featuring WizKid & Kyla |
topic_word_list = plot_word_cloud(nmf_model_all, X_all_feature_names, k_all, 6)
After individually assessing the top words per topic, our team arrived at the following appropriate labels for each topic (cluster):
SUMMARY OF TOPICS
| TOPIC | LABEL |
|---|---|
| Topic 1 | Life, relationships, triumphs |
| Topic 2 | Gangster culture, rhythmic, upbeat, and hiphop |
| Topic 3 | Passionate, extreme love or loneliness, sexual |
| Topic 4 | Rap and sexual |
| Topic 5 | Catchy, easy recall, repetitive |
| Topic 6 | Dance and party |
# Labels chosen after manually reviewing the top 15 words of each topic.
topic_dict ={1:'Life, relationships, triumphs',
2:'Gangster culture, rhythmic, upbeat, and hiphop',
3:'Passionate, extreme love or loneliness, sexual',
4:'Rap and sexual',
5:'Catchy, easy recall, repetitive',
6:'Dance and party'}
df_all['topic_label'] = df_all['topic'].map(topic_dict)
df_all.head(2)
| rank | song | artist | year | lyrics | topic | topic_label | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | Bad Day | Daniel Powter | 2006 | where is the moment we needed the most you kick up the leaves and the magic is lost they tell me your blue skies fade to gray they tell me your passions gone away and i dont need no carryin on you stand in the line just to hit a new low youre faking a smile with the coffee to go you tell me your lifes been way off line youre falling to pieces every time and i dont need no carryin on because you had a bad day youre taking one down you sing a sad song just to turn it around you say you dont know you tell me dont lie you work at a smile and you go for a ride you had a bad day the camera dont lie youre coming back down and you really dont mind you had a bad day you had a bad day will you need a blue sky holiday the point is they laugh at what you say and i dont need no carryin on you had a bad day youre taking one down you sing a sad song just to turn it around you say you dont know you tell me dont lie you work at a smile and you go for a ride you had a bad day the camera dont lie youre coming back down and you really dont mind you had a bad day ooh a holiday | 1 | Life, relationships, triumphs |
| 1 | 2 | Temperature | Sean Paul | 2006 | oh oh oh oh di gyaldem schillaci sean da paul suh mi give it to suh mi give it to suh mi give it to to all girls five million and forty naughty shorty baby girl all my girls all my girls sean da paul seh well woman the way the time cold i wanna be keepin you warm i got the right temperature fi shelter you from the storm oh lord gyal i got the right tactics to turn you on and girl i wanna be the papa you can be the mom oh oh make i see di gyal dem breakout pon di floor from you dont want no worthless performer oh oh from you dont want no man weh cant turn you on gyal make i see your hand dem up on ya oh oh cant tan pon it long nah eat no yam no steam fish nor no green banana oh oh but down in jamaica we give it to you hot like a sauna well woman the way the time cold i wanna be keepin you warm i got the right temperature fi shelter you from the storm oh lord gyal i got the right tactics to turn you on and girl i wanna be the papa you can be the mom oh oh bumper exposed and gyal ya got your chest out but you no wasters cause gyal ya impress out oh oh and if ya diss out a me ya fi test out cause i got the remedy fi make ya de stress out oh oh mi haffi flaunt it become a god bless out and girl if you want it you haffi confess out oh oh i no lie weh we need set speed haffi test the mattress out well woman the way the time cold i wanna be keepin you warm i got the right temperature fi shelter you from the storm oh lord gyal i got the right tactics to turn you on and girl i wanna be the papa you can be the mom oh oh | 4 | Rap and sexual |
For this step, our team tried different combinations of testing, but to avoid crowding this notebook with large printed dataframes, we'll just show the following instead:
Show 10 samples of songs within the dance and party cluster:
# Spot check: 10 random (deduplicated) songs from the dance-and-party topic.
(df_all[df_all['topic_label'] == 'Dance and party']
[['song', 'artist', 'year', 'topic_label']].drop_duplicates('song')
.sort_values(by='topic_label').sample(10, random_state=44))
| song | artist | year | topic_label | |
|---|---|---|---|---|
| 439 | Club Can't Handle Me | Flo Rida Featuring David Guetta | 2010 | Dance and party |
| 297 | Into The Night | Santana Featuring Chad Kroeger | 2008 | Dance and party |
| 24 | Lean Wit It, Rock Wit It | Dem Franchize Boyz Featuring Lil Peanut & Charlay | 2006 | Dance and party |
| 509 | On The Floor | Jennifer Lopez Featuring Pitbull | 2011 | Dance and party |
| 864 | Dirt | Florida Georgia Line | 2014 | Dance and party |
| 838 | Wiggle | Jason Derulo Featuring Snoop Dogg | 2014 | Dance and party |
| 326 | Circus | Britney Spears | 2009 | Dance and party |
| 386 | 3 | Britney Spears | 2009 | Dance and party |
| 496 | Smile | Uncle Kracker | 2010 | Dance and party |
| 1498 | Levitating | Dua Lipa | 2021 | Dance and party |
Famous artist:
Since Taylor Swift is famous for making songs about her relationships, and her songs usually have catchy or repetitive lyrics, we checked which clusters her songs fall into:
(df_all[df_all['artist'] == 'Taylor Swift']
[['song', 'year', 'topic_label']]
.drop_duplicates('song').sort_values(by='topic_label'))[:10]
| song | year | topic_label | |
|---|---|---|---|
| 927 | Style | 2015 | Catchy, easy recall, repetitive |
| 811 | Shake It Off | 2014 | Catchy, easy recall, repetitive |
| 188 | Teardrops On My Guitar | 2007 | Life, relationships, triumphs |
| 1221 | Delicate | 2018 | Life, relationships, triumphs |
| 1136 | Look What You Made Me Do | 2017 | Life, relationships, triumphs |
| 955 | Wildest Dreams | 2015 | Life, relationships, triumphs |
| 769 | 22 | 2013 | Life, relationships, triumphs |
| 1336 | You Need To Calm Down | 2019 | Life, relationships, triumphs |
| 714 | I Knew You Were Trouble. | 2013 | Life, relationships, triumphs |
| 572 | Back To December | 2011 | Life, relationships, triumphs |
Based on the above table (and other samples we tested), we conclude that the clustering results of using NMF are appropriate and accurate.
nmf_model_0610, U_0610, V_0610 = nmf_topics_era(X_0610, X_0610_feature_names, 15)
nmf_model_1115, U_1115, V_1115 = nmf_topics_era(X_1115, X_1115_feature_names, 15)
nmf_model_1621, U_1621, V_1621 = nmf_topics_era(X_1621, X_1621_feature_names, 15)
The first step in using K-Means clustering is also to find the optimal number of clusters or K.
For interpretability, we only selected K for up to 10 and used elbow method of the sum of squared distances, Silhouette score, and Davies Bouldin score as our considerations for finding the optimal K.
optimal_k(X_all_lsa, 5)
optimal_k(X_0610_lsa, 5)
optimal_k(X_1115_lsa, 4)
optimal_k(X_1621_lsa, 4)
The following codes will show the K-Means clustered songs, display the tabular distribution of the clusters and sample songs from each cluster, and plot the results.
# K-Means (k=5) on the NMF topic weights of the full corpus; store each
# song's cluster label and show the distribution plus sample songs.
kmeans_all_labels = nmf_kmeans(X_all, U_all, 5)
df_all['k_cluster'] = kmeans_all_labels
display_k_clusters(df_all, 5)
k_cluster 0 49 1 1292 2 14 3 212 4 31 dtype: int64 Cluster 1
| song | artist | |
|---|---|---|
| 203 | Lollipop | Lil Wayne Featuring Static Major |
| 305 | Right Round | Flo Rida |
| 8 | SexyBack | Justin Timberlake |
| 410 | Nothin' On You | B.o.B Featuring Bruno Mars |
| 411 | I Like It | Enrique Iglesias Featuring Pitbull |
| 413 | In My Head | Jason Derulo |
| 1214 | Sad! | XXXTENTACION |
| 616 | One More Night | Maroon 5 |
| 817 | Let Her Go | Passenger |
| 20 | Move Along | The All-American Rejects |
Cluster 2
| song | artist | |
|---|---|---|
| 0 | Bad Day | Daniel Powter |
| 1298 | Old Town Road | Lil Nas X Featuring Billy Ray Cyrus |
| 200 | Low | Flo Rida Featuring T-Pain |
| 300 | Boom Boom Pow | The Black Eyed Peas |
| 400 | TiK ToK | Ke$ha |
| 1198 | God's Plan | Drake |
| 1398 | Blinding Lights | The Weeknd |
| 500 | Rolling In The Deep | Adele |
| 1498 | Levitating | Dua Lipa |
| 599 | Somebody That I Used To Know | Gotye Featuring Kimbra |
Cluster 3
| song | artist | |
|---|---|---|
| 302 | Just Dance | Lady Gaga Featuring Colby O'Donis |
| 408 | Dynamite | Taio Cruz |
| 1007 | Can't Stop The Feeling! | Justin Timberlake |
| 809 | Timber | Pitbull Featuring Ke$ha |
| 1411 | Dance Monkey | Tones And I |
| 32 | Dance, Dance | Fall Out Boy |
| 542 | Dynamite | Taio Cruz |
| 1146 | Can't Stop The Feeling! | Justin Timberlake |
| 1160 | Juju On That Beat (TZ Anthem) | Zay Hilfigerrr & Zayion McCall |
| 1170 | Chained To The Rhythm | Katy Perry Featuring Skip Marley |
Cluster 4
| song | artist | |
|---|---|---|
| 899 | Uptown Funk! | Mark Ronson Featuring Bruno Mars |
| 1400 | The Box | Roddy Ricch |
| 1101 | Humble. | Kendrick Lamar |
| 1402 | Rockstar | DaBaby Featuring Roddy Ricch |
| 1302 | Wow. | Post Malone |
| 1202 | Rockstar | Post Malone Featuring 21 Savage |
| 1103 | Bad And Boujee | Migos Featuring Lil Uzi Vert |
| 1203 | Psycho | Post Malone Featuring Ty Dolla $ign |
| 1004 | Panda | Desiigner |
| 1404 | Life Is Good | Future Featuring Drake |
Cluster 5
| song | artist | |
|---|---|---|
| 705 | Just Give Me A Reason | P!nk Featuring Nate Ruess |
| 306 | I'm Yours | Jason Mraz |
| 107 | I Wanna Love You | Akon Featuring Snoop Dogg |
| 407 | Bad Romance | Lady Gaga |
| 910 | Can't Feel My Face | The Weeknd |
| 521 | How To Love | Lil Wayne |
| 1420 | 10,000 Hours | Dan + Shay & Justin Bieber |
| 225 | Clumsy | Fergie |
| 824 | Black Widow | Iggy Azalea Featuring Rita Ora |
| 226 | I'm Yours | Jason Mraz |
kmeans_0610_labels = nmf_kmeans(X_0610, U_0610, 5)
df_2006_2010['k_cluster'] = kmeans_0610_labels
display_k_clusters(df_2006_2010, 5)
k_cluster 0 419 1 8 2 20 3 13 4 40 dtype: int64 Cluster 1
| song | artist | |
|---|---|---|
| 0 | Bad Day | Daniel Powter |
| 400 | TiK ToK | Ke$ha |
| 100 | Irreplaceable | Beyonce |
| 300 | Boom Boom Pow | The Black Eyed Peas |
| 200 | Low | Flo Rida Featuring T-Pain |
| 1 | Temperature | Sean Paul |
| 401 | Need You Now | Lady Antebellum |
| 101 | Umbrella | Rihanna Featuring Jay-Z |
| 201 | Bleeding Love | Leona Lewis |
| 301 | Poker Face | Lady Gaga |
Cluster 2
| song | artist | |
|---|---|---|
| 306 | I'm Yours | Jason Mraz |
| 407 | Bad Romance | Lady Gaga |
| 225 | Clumsy | Fergie |
| 226 | I'm Yours | Jason Mraz |
| 427 | Your Love Is My Drug | Ke$ha |
| 134 | Runaway Love | Ludacris Featuring Mary J. Blige |
| 465 | Your Love | Nicki Minaj |
| 396 | Say Hey (I Love You) | Michael Franti & Spearhead Featuring Cherine Anderson |
Cluster 3
| song | artist | |
|---|---|---|
| 203 | Lollipop | Lil Wayne Featuring Static Major |
| 305 | Right Round | Flo Rida |
| 8 | SexyBack | Justin Timberlake |
| 410 | Nothin' On You | B.o.B Featuring Bruno Mars |
| 411 | I Like It | Enrique Iglesias Featuring Pitbull |
| 413 | In My Head | Jason Derulo |
| 20 | Move Along | The All-American Rejects |
| 435 | How Low | Ludacris |
| 45 | Laffy Taffy | D4L |
| 57 | There It Go! (The Whistle Song) | Juelz Santana |
Cluster 4
| song | artist | |
|---|---|---|
| 412 | BedRock | Young Money Featuring Lloyd |
| 113 | Party Like A Rockstar | Shop Boyz |
| 118 | Fergalicious | Fergie |
| 223 | So What | P!nk |
| 24 | Lean Wit It, Rock Wit It | Dem Franchize Boyz Featuring Lil Peanut & Charlay |
| 331 | Let It Rock | Kevin Rudolf Featuring Lil Wayne |
| 39 | Stickwitu | The Pussycat Dolls |
| 344 | So What | P!nk |
| 55 | Do It To It | Cherish Featuring Sean Paul Of The YoungBloodZ |
| 264 | Let It Rock | Kevin Rudolf Featuring Lil Wayne |
Cluster 5
| song | artist | |
|---|---|---|
| 404 | OMG | Usher Featuring will.i.am |
| 304 | Love Story | Taylor Swift |
| 4 | Hips Don't Lie | Shakira Featuring Wyclef Jean |
| 10 | Be Without You | Mary J. Blige |
| 414 | Rude Boy | Rihanna |
| 117 | The Way I Are | Timbaland Featuring Keri Hilson |
| 318 | Kiss Me Thru The Phone | Soulja Boy Tell 'em Featuring Sammie |
| 319 | Down | Jay Sean Featuring Lil Wayne |
| 419 | Imma Be | The Black Eyed Peas |
| 22 | Dani California | Red Hot Chili Peppers |
# Cluster the 2011-2015 era songs with K-means (k=4), using the NMF
# factors U_1115 from the document-term matrix X_1115 (nmf_kmeans is
# defined elsewhere — presumably returns one label per song; verify).
kmeans_1115_labels = nmf_kmeans(X_1115, U_1115, 4)
# Attach the cluster label of each song to the era DataFrame.
df_2011_2015['k_cluster'] = kmeans_1115_labels
# Show per-cluster counts and sample song/artist tables for the 4 clusters.
display_k_clusters(df_2011_2015, 4)
k_cluster 0 426 1 12 2 51 3 10 dtype: int64 Cluster 1
| song | artist | |
|---|---|---|
| 500 | Rolling In The Deep | Adele |
| 599 | Somebody That I Used To Know | Gotye Featuring Kimbra |
| 799 | Happy | Pharrell Williams |
| 900 | Thinking Out Loud | Ed Sheeran |
| 800 | Dark Horse | Katy Perry Featuring Juicy J |
| 600 | Call Me Maybe | Carly Rae Jepsen |
| 700 | Blurred Lines | Robin Thicke Featuring T.I. + Pharrell |
| 501 | Party Rock Anthem | LMFAO Featuring Lauren Bennett & GoonRock |
| 801 | All Of Me | John Legend |
| 701 | Radioactive | Imagine Dragons |
Cluster 2
| song | artist | |
|---|---|---|
| 705 | Just Give Me A Reason | P!nk Featuring Nate Ruess |
| 910 | Can't Feel My Face | The Weeknd |
| 521 | How To Love | Lil Wayne |
| 824 | Black Widow | Iggy Azalea Featuring Rita Ora |
| 726 | I Love It | Icona Pop Featuring Charli XCX |
| 729 | The Way | Ariana Grande Featuring Mac Miller |
| 632 | As Long As You Love Me | Justin Bieber Featuring Big Sean |
| 954 | Love Me Harder | Ariana Grande & The Weeknd |
| 672 | Let Me Love You (Until You Learn To Love Yourself) | Ne-Yo |
| 784 | Let Me Love You (Until You Learn To Love Yourself) | Ne-Yo |
Cluster 3
| song | artist | |
|---|---|---|
| 699 | Thrift Shop | Macklemore & Ryan Lewis Featuring Wanz |
| 899 | Uptown Funk! | Mark Ronson Featuring Bruno Mars |
| 802 | Fancy | Iggy Azalea Featuring Charli XCX |
| 908 | The Hills | The Weeknd |
| 519 | Look At Me Now | Chris Brown Featuring Lil Wayne & Busta Rhymes |
| 618 | The Motto | Drake Featuring Lil Wayne |
| 720 | Holy Grail | Jay Z Featuring Justin Timberlake |
| 922 | Post To Be | Omarion Featuring Chris Brown & Jhene Aiko |
| 828 | Loyal | Chris Brown Featuring Lil Wayne & French Montana Or Too $hort Or Tyga |
| 630 | Young, Wild & Free | Snoop Dogg & Wiz Khalifa Featuring Bruno Mars |
Cluster 4
| song | artist | |
|---|---|---|
| 616 | One More Night | Maroon 5 |
| 817 | Let Her Go | Passenger |
| 819 | Let It Go | Idina Menzel |
| 736 | One More Night | Maroon 5 |
| 551 | Motivation | Kelly Rowland Featuring Lil Wayne |
| 556 | Rocketeer | Far*East Movement Featuring Ryan Tedder |
| 859 | Trumpets | Jason Derulo |
| 663 | Let's Go | Calvin Harris Featuring Ne-Yo |
| 589 | Please Don't Go | Mike Posner |
| 795 | Let Her Go | Passenger |
# Cluster the 2016-2021 era songs with K-means (k=4), using the NMF
# factors U_1621 from the document-term matrix X_1621 (nmf_kmeans is
# defined elsewhere — presumably returns one label per song; verify).
kmeans_1621_labels = nmf_kmeans(X_1621, U_1621, 4)
# Attach the cluster label of each song to the era DataFrame.
df_2016_2021['k_cluster'] = kmeans_1621_labels
# Show per-cluster counts and sample song/artist tables for the 4 clusters.
display_k_clusters(df_2016_2021, 4)
k_cluster 0 103 1 362 2 127 3 7 dtype: int64 Cluster 1
| song | artist | |
|---|---|---|
| 1498 | Levitating | Dua Lipa |
| 1098 | Shape Of You | Ed Sheeran |
| 1100 | That's What I Like | Bruno Mars |
| 1501 | Mood | 24kGoldn Featuring iann dior |
| 1503 | Kiss Me More | Doja Cat Featuring SZA |
| 1304 | 7 Rings | Ariana Grande |
| 1504 | Leave The Door Open | Silk Sonic (Bruno Mars & Anderson .Paak) |
| 1006 | Don't Let Me Down | The Chainsmokers Featuring Daya |
| 1506 | Montero (Call Me By Your Name) | Lil Nas X |
| 1206 | In My Feelings | Drake |
Cluster 2
| song | artist | |
|---|---|---|
| 999 | Love Yourself | Justin Bieber |
| 1398 | Blinding Lights | The Weeknd |
| 1198 | God's Plan | Drake |
| 1298 | Old Town Road | Lil Nas X Featuring Billy Ray Cyrus |
| 1199 | Perfect | Ed Sheeran |
| 1099 | Despacito | Luis Fonsi & Daddy Yankee Featuring Justin Bieber |
| 1399 | Circles | Post Malone |
| 1499 | Save Your Tears | The Weeknd & Ariana Grande |
| 1000 | Sorry | Justin Bieber |
| 1299 | Sunflower (Spider-Man: Into The Spider-Verse) | Post Malone & Swae Lee |
Cluster 3
| song | artist | |
|---|---|---|
| 1400 | The Box | Roddy Ricch |
| 1101 | Humble. | Kendrick Lamar |
| 1302 | Wow. | Post Malone |
| 1402 | Rockstar | DaBaby Featuring Roddy Ricch |
| 1203 | Psycho | Post Malone Featuring Ty Dolla $ign |
| 1103 | Bad And Boujee | Migos Featuring Lil Uzi Vert |
| 1004 | Panda | Desiigner |
| 1204 | I Like It | Cardi B, Bad Bunny & J Balvin |
| 1404 | Life Is Good | Future Featuring Drake |
| 1306 | Sicko Mode | Travis Scott |
Cluster 4
| song | artist | |
|---|---|---|
| 1007 | Can't Stop The Feeling! | Justin Timberlake |
| 1411 | Dance Monkey | Tones And I |
| 1146 | Can't Stop The Feeling! | Justin Timberlake |
| 1160 | Juju On That Beat (TZ Anthem) | Zay Hilfigerrr & Zayion McCall |
| 1170 | Chained To The Rhythm | Katy Perry Featuring Skip Marley |
| 1474 | Only Human | Jonas Brothers |
| 1375 | Only Human | Jonas Brothers |
# Agglomerative clustering view over the full 2006-2021 dataset:
# display_clusters (defined elsewhere) presumably assigns/uses 5 cluster
# labels on the LSA-reduced matrix X_all_lsa, prints sample songs, and
# returns the DataFrame with labels attached — confirm in its definition.
df_all = display_clusters(X_all_lsa, df_all, 5, samples=True)
# Plot per-cluster word clouds / top words ("ac" presumably refers to
# agglomerative clustering; k_all is the fitted model or its labels —
# TODO confirm).
plot_wcloud_ac(df_all, k_all, 5)
0 363 1 270 2 129 3 288 4 548 dtype: int64 Cluster 1
| rank | song | artist | |
|---|---|---|---|
| 699 | 1 | Thrift Shop | Macklemore & Ryan Lewis Featuring Wanz |
| 899 | 1 | Uptown Funk! | Mark Ronson Featuring Bruno Mars |
| 301 | 2 | Poker Face | Lady Gaga |
Cluster 2
| rank | song | artist | |
|---|---|---|---|
| 1298 | 1 | Old Town Road | Lil Nas X Featuring Billy Ray Cyrus |
| 200 | 1 | Low | Flo Rida Featuring T-Pain |
| 300 | 1 | Boom Boom Pow | The Black Eyed Peas |
Cluster 3
| rank | song | artist | |
|---|---|---|---|
| 1198 | 1 | God's Plan | Drake |
| 1098 | 1 | Shape Of You | Ed Sheeran |
| 201 | 2 | Bleeding Love | Leona Lewis |
Cluster 4
| rank | song | artist | |
|---|---|---|---|
| 1398 | 1 | Blinding Lights | The Weeknd |
| 500 | 1 | Rolling In The Deep | Adele |
| 599 | 1 | Somebody That I Used To Know | Gotye Featuring Kimbra |
Cluster 5
| rank | song | artist | |
|---|---|---|---|
| 0 | 1 | Bad Day | Daniel Powter |
| 1498 | 1 | Levitating | Dua Lipa |
| 799 | 1 | Happy | Pharrell Williams |
Top 15 words for Cluster 1: ['like' 'got' 'know' 'get' 'bitch' 'nigga' 'shit' 'go' 'fuck' 'back' 'money' 'see' 'girl' 'want' 'baby'] Top 15 words for Cluster 2: ['like' 'got' 'know' 'go' 'get' 'wanna' 'make' 'back' 'girl' 'rock' 'see' 'time' 'good' 'take' 'right'] Top 15 words for Cluster 3: ['love' 'like' 'know' 'baby' 'got' 'want' 'girl' 'say' 'cant' 'never' 'get' 'keep' 'need' 'take' 'tell'] Top 15 words for Cluster 4: ['love' 'know' 'baby' 'like' 'got' 'say' 'cant' 'never' 'want' 'need' 'see' 'get' 'wanna' 'girl' 'feel'] Top 15 words for Cluster 5: ['know' 'like' 'go' 'wanna' 'got' 'time' 'baby' 'never' 'make' 'cant' 'get' 'say' 'take' 'way' 'right']
# Agglomerative clustering view for the 2006-2010 era (5 clusters) on the
# LSA-reduced matrix; display_clusters presumably returns the DataFrame
# with labels attached and prints sample songs — confirm in its definition.
df_2006_2010 = display_clusters(X_0610_lsa, df_2006_2010, 5, samples=True)
# Word clouds / top-15 words per cluster for this era.
plot_wcloud_ac(df_2006_2010, k_0610, 5)
0 170 1 178 2 44 3 44 4 64 dtype: int64 Cluster 1
| rank | song | artist | |
|---|---|---|---|
| 401 | 2 | Need You Now | Lady Antebellum |
| 201 | 2 | Bleeding Love | Leona Lewis |
| 3 | 4 | You're Beautiful | James Blunt |
Cluster 2
| rank | song | artist | |
|---|---|---|---|
| 400 | 1 | TiK ToK | Ke$ha |
| 300 | 1 | Boom Boom Pow | The Black Eyed Peas |
| 200 | 1 | Low | Flo Rida Featuring T-Pain |
Cluster 3
| rank | song | artist | |
|---|---|---|---|
| 301 | 2 | Poker Face | Lady Gaga |
| 302 | 3 | Just Dance | Lady Gaga Featuring Colby O'Donis |
| 404 | 5 | OMG | Usher Featuring will.i.am |
Cluster 4
| rank | song | artist | |
|---|---|---|---|
| 2 | 3 | Promiscuous | Nelly Furtado Featuring Timbaland |
| 403 | 4 | California Gurls | Katy Perry Featuring Snoop Dogg |
| 207 | 8 | Love In This Club | Usher Featuring Young Jeezy |
Cluster 5
| rank | song | artist | |
|---|---|---|---|
| 0 | 1 | Bad Day | Daniel Powter |
| 100 | 1 | Irreplaceable | Beyonce |
| 101 | 2 | Umbrella | Rihanna Featuring Jay-Z |
Top 15 words for Cluster 1: ['know' 'say' 'like' 'love' 'cant' 'time' 'never' 'got' 'see' 'life' 'away' 'make' 'get' 'take' 'way'] Top 15 words for Cluster 2: ['like' 'got' 'go' 'know' 'get' 'wanna' 'make' 'see' 'right' 'girl' 'boy' 'baby' 'rock' 'back' 'take'] Top 15 words for Cluster 3: ['love' 'know' 'like' 'girl' 'cant' 'got' 'baby' 'want' 'see' 'keep' 'way' 'time' 'get' 'wanna' 'right'] Top 15 words for Cluster 4: ['baby' 'know' 'love' 'girl' 'like' 'got' 'want' 'get' 'wanna' 'take' 'ba' 'see' 'boy' 'go' 'right'] Top 15 words for Cluster 5: ['like' 'know' 'right' 'get' 'make' 'go' 'time' 'got' 'take' 'say' 'wanna' 'cant' 'way' 'back' 'never']
# Agglomerative clustering view for the 2011-2015 era (4 clusters).
# NOTE(review): the extra t=0.6 argument is presumably a threshold used
# inside display_clusters (e.g. for sample selection) — confirm there.
df_2011_2015 = display_clusters(X_1115_lsa, df_2011_2015, 4, samples=True, t=0.6)
# Word clouds / top-15 words per cluster for this era.
plot_wcloud_ac(df_2011_2015, k_1115, 4)
0 204 1 119 2 87 3 89 dtype: int64 Cluster 1
| rank | song | artist | |
|---|---|---|---|
| 0 | 1 | Rolling In The Deep | Adele |
| 299 | 1 | Happy | Pharrell Williams |
| 300 | 2 | Dark Horse | Katy Perry Featuring Juicy J |
Cluster 2
| rank | song | artist | |
|---|---|---|---|
| 199 | 1 | Thrift Shop | Macklemore & Ryan Lewis Featuring Wanz |
| 399 | 1 | Uptown Funk! | Mark Ronson Featuring Bruno Mars |
| 200 | 2 | Blurred Lines | Robin Thicke Featuring T.I. + Pharrell |
Cluster 3
| rank | song | artist | |
|---|---|---|---|
| 99 | 1 | Somebody That I Used To Know | Gotye Featuring Kimbra |
| 400 | 2 | Thinking Out Loud | Ed Sheeran |
| 401 | 3 | See You Again | Wiz Khalifa Featuring Charlie Puth |
Cluster 4
| rank | song | artist | |
|---|---|---|---|
| 1 | 2 | Party Rock Anthem | LMFAO Featuring Lauren Bennett & GoonRock |
| 203 | 5 | Can't Hold Us | Macklemore & Ryan Lewis Featuring Ray Dalton |
| 304 | 6 | Talk Dirty | Jason Derulo Featuring 2 Chainz |
Top 15 words for Cluster 1: ['like' 'know' 'never' 'go' 'come' 'got' 'wanna' 'baby' 'make' 'time' 'cant' 'feel' 'love' 'get' 'back'] Top 15 words for Cluster 2: ['like' 'got' 'know' 'baby' 'get' 'girl' 'want' 'go' 'love' 'nigga' 'money' 'make' 'wanna' 'back' 'shit'] Top 15 words for Cluster 3: ['love' 'like' 'baby' 'know' 'got' 'girl' 'never' 'need' 'cant' 'feel' 'want' 'way' 'make' 'hard' 'see'] Top 15 words for Cluster 4: ['like' 'get' 'know' 'go' 'got' 'wanna' 'little' 'night' 'good' 'rock' 'back' 'girl' 'baby' 'tell' 'make']
# Agglomerative clustering view for the 2016-2021 era (4 clusters), with
# a tighter threshold t=0.5 than the 2011-2015 run (t's exact semantics
# live in display_clusters — TODO confirm).
df_2016_2021 = display_clusters(X_1621_lsa, df_2016_2021, 4, samples=True, t=0.5)
# Word clouds / top-15 words per cluster for this era.
plot_wcloud_ac(df_2016_2021, k_1621, 4)
0 152 1 144 2 118 3 185 dtype: int64 Cluster 1
| rank | song | artist | |
|---|---|---|---|
| 401 | 3 | The Box | Roddy Ricch |
| 102 | 4 | Humble. | Kendrick Lamar |
| 303 | 5 | Wow. | Post Malone |
Cluster 2
| rank | song | artist | |
|---|---|---|---|
| 0 | 1 | Love Yourself | Justin Bieber |
| 199 | 1 | God's Plan | Drake |
| 99 | 1 | Shape Of You | Ed Sheeran |
Cluster 3
| rank | song | artist | |
|---|---|---|---|
| 299 | 1 | Old Town Road | Lil Nas X Featuring Billy Ray Cyrus |
| 100 | 2 | Despacito | Luis Fonsi & Daddy Yankee Featuring Justin Bieber |
| 202 | 4 | Havana | Camila Cabello Featuring Young Thug |
Cluster 4
| rank | song | artist | |
|---|---|---|---|
| 499 | 1 | Levitating | Dua Lipa |
| 399 | 1 | Blinding Lights | The Weeknd |
| 200 | 2 | Perfect | Ed Sheeran |
Top 15 words for Cluster 1: ['like' 'got' 'know' 'bitch' 'get' 'nigga' 'shit' 'fuck' 'go' 'niggas' 'back' 'cant' 'want' 'money' 'lil'] Top 15 words for Cluster 2: ['love' 'know' 'like' 'baby' 'want' 'need' 'wanna' 'got' 'say' 'never' 'tell' 'time' 'get' 'cant' 'go'] Top 15 words for Cluster 3: ['like' 'got' 'doo' 'know' 'back' 'gang' 'go' 'dance' 'work' 'cant' 'get' 'woo' 'low' 'high' 'way'] Top 15 words for Cluster 4: ['know' 'like' 'go' 'cant' 'got' 'say' 'baby' 'way' 'love' 'never' 'need' 'get' 'time' 'take' 'wanna']
In this part, we assign the social tag associated with the clusters formed. These tags will be based on the top words and what that implies about the cluster. However, in the cases that there is no discernible theme just from the top words, the top-ranking words were used as basis for the social tag.
Cluster 1 is labeled as Romantic, with Love as one of its top words.
Cluster 2 is primarily composed of Dance and Catchy Songs. Looking at the top-ranking songs, its lyrics are easy to follow and mostly repetitive.
Cluster 3 also has Love as one of its top words; however, what separates it from the previous cluster is that it is labeled as more Passionate.
Cluster 4, similar to two previous clusters, is about Love. The songs in this cluster are more intimate and it was given a Sexual theme.
Cluster 5 was the cluster with the most indistinct songs; the topics varied, but it can be said that it is mostly about Life and tries to be relatable to the audience.
Cluster 1 has the Life social tag similar to the previous era. It mostly contains songs that people can relate to and has no significant top words.
Cluster 2 is a new cluster that emerged in this era, Rap + Hip-hop. The top words include expletives and money, which can be associated with the said genre; upon validation using the top-ranking songs, they are mostly sung by rappers.
Cluster 3 continues the three clusters about Love from the previous era. This time, all songs about love are found in a single cluster.
Cluster 4 contains songs that are expected to be played during Dancing or Parties. Some of the top words, such as night and rock, explain the setting of such parties.
Cluster 1 is the biggest cluster in this era. It also represents the social tag Life which has been present in the last two eras.
Cluster 2 consists mostly of Rap and Hip-hop songs, which became popular in the previous era.
Cluster 3 is about Love. Along with the Life cluster, this cluster is also present throughout all eras.
Cluster 4 was labeled as Gangster, which is very similar to the Rap and Hip-hop cluster. A top word was Gang, which was not a top word in the other similar cluster.
Future Work
Songwriters and artists can use lyric-based analysis and classification of the top songs over the years. They can use this to identify keywords and themes that recently successful songs have, and strategize what messages they want to include when crafting new songs.
The clusters formed by the models in this study can be used to generate playlists that have one common theme.
Model Improvements
Increase the scope of our data by scraping more songs outside of the Hot-100. We can get more insights from the clustering and analyzing songs that aren't as successful as the top songs.
Utilize metadata that will help identify the social tags of the clusters such as genre, sub-genres, song length, and others.
Aside from the bare lyrics, we can use lyric interpretations from Genius.com or other sources. This will give more information about the context of songs, given that some songs don't directly say the message hidden behind the words.
In this report the truncated SVD matrix was used for the Non-Negative Matrix Factorization clustering so that all three clustering methods had the same input. However, it is possible to use the raw TF-IDF matrix for NMF since it is also a dimensionality reduction method.
We were able to successfully group different Hot-100 Billboard Songs from 2006 to 2021 into clusters and identify each cluster by social tags.
After exploring the three clustering methods, we were able to arrive at the following conclusion:
Overall/all year clustering results yielded Life, relationships, triumphs, Gangster culture, rhythmic, upbeat, and hiphop, Passionate, extreme love or loneliness, sexual, Rap and sexual, Catchy, easy recall, repetitive, and Dance and party as the main themes.
Whereas era clusters resulted in the following:
2006-2010's main themes are Romantic, Dance and Catchy songs, Passionate, Sexual, and Life.
2011-2015's main themes are Life, Rap + Hip-hop, Love, and Dance and Party.
2016-2021's main themes are Life, Rap + Hip-hop, Love, and Gangster.
Also, it is worth highlighting the following notes:
NMF: clustering the dataset using the optimal k number of topics identified through NMF provided clustering results that are consistent and with no overlap.
K-means Clustering: this model was not able to properly segregate the data which could be due to the existence of outliers, high dimensionality, and sensitivity to the density of each cluster.
Agglomerative Hierarchical Clustering: was able to provide cleaner clustering than K-means; however, there were still some overlaps in the social tags assigned to each cluster.
Overall, NMF performed the best among the three models used in terms of answering our problem statement.
[1] YEAR-END CHARTS Hot 100 Songs. (n.d). Billboard Website. https://www.billboard.com/charts/year-end/2021/hot-100-songs/. (accessed 2022, November)
[2] Getting Started. (n.d). Genius API Website. https://lyricsgenius.readthedocs.io/en/master/index.html. (accessed 2022, November)
END